C x86_64/sha3-permute.asm

ifelse(`
   Copyright (C) 2012 Niels Möller

   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free
       Software Foundation; either version 2 of the License, or (at your
       option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
')

define(`CTX', `%rdi')	C 25 64-bit values, 200 bytes.
define(`COUNT', `%r8')	C Avoid clobbering %rsi, for W64.

define(`A00', `%rax')
define(`A0102', `%xmm0')
define(`A0304', `%xmm1')
define(`A05', `%rcx')
define(`A0607', `%xmm2')
define(`A0809', `%xmm3')
define(`A10', `%rdx')
define(`A1112', `%xmm4')
define(`A1314', `%xmm5')
define(`A15', `%rbp')
define(`A1617', `%xmm6')
define(`A1819', `%xmm7')
define(`A20', `%r9')
define(`A2122', `%xmm8')
define(`A2324', `%xmm9')

define(`C0', `%r10')
define(`C12', `%xmm10')
define(`C34', `%xmm11')
define(`D0', `%r11')
define(`D12', `%xmm12')
define(`D34', `%xmm13')

C Wide temporaries
define(`W0', `%xmm14')
define(`W1', `%xmm15')
define(`W2', `%xmm12')	C Overlap D12
define(`W3', `%xmm13')	C Overlap D34

define(`T0', `%r12')
define(`T1', `%r13')
define(`T2', `%r11')	C Overlap D0
define(`T3', `%r10')	C Overlap C0

define(`RC', `%r14')

define(`OFFSET', `ifelse($1,0,,eval(8*$1))')
define(`STATE', `OFFSET($1)(CTX)')

define(`SWAP64', `pshufd	`$'0x4e,')

define(`DIRECT_MOVQ', `no')

C MOVQ(src, dst), for moves between a general register and an xmm
C register.

ifelse(DIRECT_MOVQ, yes, `
C movq calls that are equal to the corresponding movd,
C where the Apple assembler requires them to be written as movd.
define(`MOVQ', `movd	$1, $2')
', `
C Moving via (cached) memory is generally faster.
define(`MOVQ', `
	movq	$1, (CTX)
	movq	(CTX), $2
')')

C ROTL64(rot, register, temp)
C Caller needs to or together the result.
define(`ROTL64', `
	movdqa	$2, $3
	psllq	`$'$1, $2
	psrlq	`$'eval(64-$1), $3
')

	.file "sha3-permute.asm"

	C sha3_permute(struct sha3_state *ctx)
	.text
	ALIGN(16)
PROLOGUE(nettle_sha3_permute)
	W64_ENTRY(1, 16)
	push	%rbp
	push	%r12
	push	%r13
	push	%r14

	movl	$24, XREG(COUNT)
	lea	.rc-8(%rip), RC
	movq	STATE(0), A00
	movups	STATE(1), A0102
	movups	STATE(3), A0304
	movq	A00, C0

	movq	STATE(5), A05
	movdqa	A0102, C12
	movups	STATE(6), A0607
	movdqa	A0304, C34
	movups	STATE(8), A0809
	xorq	A05, C0

	movq	STATE(10), A10
	pxor	A0607, C12
	movups	STATE(11), A1112
	pxor	A0809, C34
	movups	STATE(13), A1314
	xorq	A10, C0

	movq	STATE(15), A15
	pxor	A1112, C12
	movups	STATE(16), A1617
	pxor	A1314, C34
	movups	STATE(18), A1819
	xorq	A15, C0

	movq	STATE(20), A20
	pxor	A1617, C12
	movups	STATE(21), A2122
	pxor	A1819, C34
	movups	STATE(23), A2324
	xorq	A20, C0
	pxor	A2122, C12
	pxor	A2324, C34

	ALIGN(16)
.Loop:
	C The theta step. Combine parity bits, then xor to state.
	C D0 = C4 ^ (C1 <<< 1)
	C D1 = C0 ^ (C2 <<< 1)
	C D2 = C1 ^ (C3 <<< 1)
	C D3 = C2 ^ (C4 <<< 1)
	C D4 = C3 ^ (C0 <<< 1)

	C Shift the words around, putting (C0, C1) in D12, (C2, C3) in
	C D34, and (C4, C0) in C34.
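
	C For reference, the theta step computed here corresponds to the
	C following portable C sketch (illustrative only, not part of the
	C build; rotl64 is an assumed 64-bit rotate helper, unrelated to
	C the ROTL64 m4 macro above, and Cp stands for the parity words
	C C0..C4):
	C
	C   uint64_t Cp[5], D[5];
	C   for (x = 0; x < 5; x++)
	C     Cp[x] = A[x] ^ A[x+5] ^ A[x+10] ^ A[x+15] ^ A[x+20];
	C   for (x = 0; x < 5; x++)
	C     D[x] = Cp[(x+4) % 5] ^ rotl64(Cp[(x+1) % 5], 1);
	C   for (i = 0; i < 25; i++)
	C     A[i] ^= D[i % 5];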
	C Notes on "unpack" instructions:
	C   punpckhqdq 01, 23 gives 31
	C   punpcklqdq 01, 23 gives 20

	SWAP64	C34, C34		C Holds C4, C3
	movdqa	C12, D34
	MOVQ(C0, D12)
	punpcklqdq	C12, D12	C Holds C0, C1
	punpckhqdq	C34, D34	C Holds C2, C3
	punpcklqdq	D12, C34	C Holds C4, C0
	MOVQ(C34, D0)
	MOVQ(C12, T0)
	rolq	$1, T0
	xorq	T0, D0

	C Can use C12 as temporary
	movdqa	D34, W0
	movdqa	D34, W1
	psllq	$1, W0
	psrlq	$63, W1
	pxor	W0, D12
	pxor	W1, D12		C Done D12

	movdqa	C34, C12
	psrlq	$63, C34
	psllq	$1, C12
	pxor	C34, D34
	pxor	C12, D34	C Done D34

	xorq	D0, A00
	xorq	D0, A05
	xorq	D0, A10
	xorq	D0, A15
	xorq	D0, A20
	pxor	D12, A0102
	pxor	D12, A0607
	pxor	D12, A1112
	pxor	D12, A1617
	pxor	D12, A2122
	pxor	D34, A0304
	pxor	D34, A0809
	pxor	D34, A1314
	pxor	D34, A1819
	pxor	D34, A2324

	C theta step done, no C, D or W temporaries alive.

	C rho and pi steps. When doing the permutations, also
	C transpose the matrix.

	C The combined permutation + transpose gives the following
	C cycles (rotation counts in parenthesis)
	C   0 <- 0(0)
	C   1 <- 3(28) <- 4(27) <- 2(62) <- 1(1)
	C   5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
	C   7 <- 7(6)
	C   10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
	C   14 <- 14(39)
	C   15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
	C   16 <- 16(45)
	C   20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
	C   23 <- 23(56)

	C Do the 1,2,3,4 row. First rotate, then permute.
	movdqa	A0102, W0
	movdqa	A0102, W1
	movdqa	A0102, W2
	psllq	$1, A0102
	psrlq	$63, W0
	psllq	$62, W1
	por	A0102, W0	C rotl 1 (A01)
	psrlq	$2, W2
	por	W1, W2		C rotl 62 (A02)

	movdqa	A0304, A0102
	movdqa	A0304, W1
	psllq	$28, A0102
	psrlq	$36, W1
	por	W1, A0102	C rotl 28 (A03)
	movdqa	A0304, W1
	psllq	$27, A0304
	psrlq	$37, W1
	por	W1, A0304	C rotl 27 (A04)
	punpcklqdq	W0, A0102
	punpckhqdq	W2, A0304

	C  5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
	C  7 <- 7(6)
	C        __ _______
	C   _ L'   `  L_ __`
	C  |5|    |6|7|  |8|9|
	C   `-_________-^`-^

	rolq	$36, A05
	MOVQ(A05, W0)
	MOVQ(A0607, A05)
	rolq	$44, A05	C Done A05
	ROTL64(6, A0607, W1)
	por	A0607, W1
	movdqa	A0809, A0607
	ROTL64(20, A0607, W2)
	por	W2, A0607
	punpckhqdq	W1, A0607	C Done A0607
	ROTL64(55, A0809, W1)
	por	A0809, W1
	movdqa	W0, A0809
	punpcklqdq	W1, A0809	C Done A0809

	C  10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
	C  14 <- 14(39)
	C          _____ ___
	C    __L'  __`_L_  `_____
	C   |10|  |11|12|  |13|14|
	C    `-___-^`-______-^
	C
	rolq	$42, A10		C 42 + 25 = 3 (mod 64)
	SWAP64	A1112, W0
	MOVQ(A10, A1112)
	MOVQ(W0, A10)
	rolq	$43, A10		C Done A10

	punpcklqdq	A1314, A1112
	ROTL64(25, A1112, W1)
	por	W1, A1112		C Done A1112
	ROTL64(39, A1314, W2)
	por	A1314, W2
	ROTL64(10, W0, A1314)
	por	W0, A1314
	punpckhqdq	W2, A1314	C Done A1314

	C  15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
	C  16 <- 16(45)
	C        _____________
	C       /      _______
	C   _L'   ____L'  |   `_
	C  |15|  |16|17|  |18|19|
	C    \     `_____-^    ^
	C     \_________________/

	SWAP64	A1819, W0
	rolq	$41, A15
	MOVQ(A15, W1)
	MOVQ(A1819, A15)
	rolq	$21, A15	C Done A15
	SWAP64	A1617, A1819
	ROTL64(45, A1617, W2)
	por	W2, A1617
	ROTL64(8, W0, W3)
	por	W3, W0
	punpcklqdq	W0, A1617	C Done A1617
	ROTL64(15, A1819, W2)
	por	W2, A1819
	punpcklqdq	W1, A1819	C Done A1819

	C  20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
	C  23 <- 23(56)
	C        _______________
	C       /               \
	C   _L'   _L'\_  ___`_   \
	C  |20|  |21|22|  |23|24|
	C    \     `__ ^________-^
	C     \_______/

	rolq	$18, A20
	MOVQ(A20, W0)
	SWAP64	A2324, W1
	MOVQ(W1, A20)
	rolq	$14, A20	C Done A20
	ROTL64(56, A2324, W1)
	por	W1, A2324
	movdqa	A2122, W2
	ROTL64(2, W2, W1)
	por	W1, W2
	punpcklqdq	W2, A2324	C Done A2324
	ROTL64(61, A2122, W1)
	por	W1, A2122
	psrldq	$8, A2122
	punpcklqdq	W0, A2122	C Done A2122

	C chi step. With the transposed matrix, applied independently
	C to each column.
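
	C For reference, chi on an untransposed state is, in portable C
	C (illustrative sketch only, not part of the build; A is the
	C usual 5x5 state indexed as A[x + 5*y]):
	C
	C   for (y = 0; y < 25; y += 5)
	C     {
	C       uint64_t t[5];
	C       for (x = 0; x < 5; x++)
	C         t[x] = A[y + x] ^ (~A[y + (x+1) % 5] & A[y + (x+2) % 5]);
	C       for (x = 0; x < 5; x++)
	C         A[y + x] = t[x];
	C     }
	C
	C Below, the same operation is applied lane-wise to the rows of
	C the transposed matrix kept in the registers.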
	movq	A05, T0
	notq	T0
	andq	A10, T0
	movq	A10, T1
	notq	T1
	andq	A15, T1
	movq	A15, T2
	notq	T2
	andq	A20, T2
	xorq	T2, A10
	movq	A20, T3
	notq	T3
	andq	A00, T3
	xorq	T3, A15
	movq	A00, T2
	notq	T2
	andq	A05, T2
	xorq	T2, A20
	xorq	T0, A00
	xorq	T1, A05

	movdqa	A0607, W0
	pandn	A1112, W0
	movdqa	A1112, W1
	pandn	A1617, W1
	movdqa	A1617, W2
	pandn	A2122, W2
	pxor	W2, A1112
	movdqa	A2122, W3
	pandn	A0102, W3
	pxor	W3, A1617
	movdqa	A0102, W2
	pandn	A0607, W2
	pxor	W2, A2122
	pxor	W0, A0102
	pxor	W1, A0607

	movdqa	A0809, W0
	pandn	A1314, W0
	movdqa	A1314, W1
	pandn	A1819, W1
	movdqa	A1819, W2
	pandn	A2324, W2
	pxor	W2, A1314
	movdqa	A2324, W3
	pandn	A0304, W3
	pxor	W3, A1819
	movdqa	A0304, W2
	pandn	A0809, W2
	pxor	W2, A2324
	pxor	W0, A0304
	pxor	W1, A0809

	xorq	(RC, COUNT, 8), A00

	C Transpose.
	C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304,
	C and also copy to C12 and C34 while at it.
	MOVQ(A05, C12)
	MOVQ(A15, C34)
	MOVQ(A10, W0)
	MOVQ(A20, W1)
	movq	A00, C0
	punpcklqdq	W0, C12
	punpcklqdq	W1, C34

	MOVQ(A0102, A05)
	MOVQ(A0304, A15)
	psrldq	$8, A0102
	psrldq	$8, A0304
	xorq	A05, C0
	xorq	A15, C0
	MOVQ(A0102, A10)
	MOVQ(A0304, A20)

	movdqa	C12, A0102
	movdqa	C34, A0304

	C Transpose (A0607, A1112)
	movdqa	A0607, W0
	punpcklqdq	A1112, A0607
	xorq	A10, C0
	xorq	A20, C0
	punpckhqdq	W0, A1112
	SWAP64	A1112, A1112

	C Transpose (A1819, A2324)
	movdqa	A1819, W0
	punpcklqdq	A2324, A1819
	pxor	A0607, C12
	pxor	A1112, C12
	punpckhqdq	W0, A2324
	SWAP64	A2324, A2324

	C Transpose (A0809, A1314) and (A1617, A2122), and swap
	movdqa	A0809, W0
	movdqa	A1314, W1
	movdqa	A1617, A0809
	movdqa	A2122, A1314
	pxor	A1819, C34
	pxor	A2324, C34
	punpcklqdq	A2122, A0809
	punpckhqdq	A1617, A1314
	SWAP64	A1314, A1314
	movdqa	W0, A1617
	movdqa	W1, A2122
	pxor	A0809, C34
	pxor	A1314, C34
	punpcklqdq	W1, A1617
	punpckhqdq	W0, A2122
	SWAP64	A2122, A2122
	decl	XREG(COUNT)
	pxor	A1617, C12
	pxor	A2122, C12
	jnz	.Loop

	movq	A00, STATE(0)
	movups	A0102, STATE(1)
	movups	A0304, STATE(3)
	movq	A05, STATE(5)
	movups	A0607, STATE(6)
	movups	A0809, STATE(8)
	movq	A10, STATE(10)
	movups	A1112, STATE(11)
	movups	A1314, STATE(13)
	movq	A15, STATE(15)
	movups	A1617, STATE(16)
	movups	A1819, STATE(18)
	movq	A20, STATE(20)
	movups	A2122, STATE(21)
	movups	A2324, STATE(23)

	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	W64_EXIT(1, 16)
	ret
EPILOGUE(nettle_sha3_permute)

	ALIGN(16)
.rc:	C In reverse order
	.quad	0x8000000080008008
	.quad	0x0000000080000001
	.quad	0x8000000000008080
	.quad	0x8000000080008081
	.quad	0x800000008000000A
	.quad	0x000000000000800A
	.quad	0x8000000000000080
	.quad	0x8000000000008002
	.quad	0x8000000000008003
	.quad	0x8000000000008089
	.quad	0x800000000000008B
	.quad	0x000000008000808B
	.quad	0x000000008000000A
	.quad	0x0000000080008009
	.quad	0x0000000000000088
	.quad	0x000000000000008A
	.quad	0x8000000000008009
	.quad	0x8000000080008081
	.quad	0x0000000080000001
	.quad	0x000000000000808B
	.quad	0x8000000080008000
	.quad	0x800000000000808A
	.quad	0x0000000000008082
	.quad	0x0000000000000001
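
C The table above lists the Keccak round constants RC[23] down to
C RC[0].  Since RC points 8 bytes below .rc and COUNT counts down
C from 24, the iota step "xorq (RC, COUNT, 8), A00" in the loop picks
C up RC[i] in round i.  A rough C-level equivalent (illustrative
C sketch only, not part of the build; rc is assumed to hold the
C reversed table above):
C
C   for (count = 24; count > 0; count--)
C     {
C       /* theta, rho, pi, chi as above, with i = 24 - count */
C       A[0] ^= rc[count - 1];   /* rc[23 - i] == RC[i] */
C     }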