From 34ea00b47003939fe1ea616f2e57067cea2c7c8f Mon Sep 17 00:00:00 2001
From: Niels Möller
Date: Wed, 21 Feb 2018 23:04:44 +0100
Subject: New sha256 implementation using sha_ni instructions.

---
 x86_64/sha_ni/sha256-compress.asm | 175 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 x86_64/sha_ni/sha256-compress.asm

diff --git a/x86_64/sha_ni/sha256-compress.asm b/x86_64/sha_ni/sha256-compress.asm
new file mode 100644
index 00000000..f2a4bd32
--- /dev/null
+++ b/x86_64/sha_ni/sha256-compress.asm
@@ -0,0 +1,175 @@
+C x86_64/sha_ni/sha256-compress.asm
+
+ifelse(<
+   Copyright (C) 2018 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+	.file "sha256-compress.asm"
+define(<STATE>, <%rdi>)
+define(<INPUT>, <%rsi>)
+define(<K>, <%rdx>)
+
+define(<MSGK>,<%xmm0>)	C Implicit operand of sha256rnds2
+define(<MSG0>,<%xmm1>)
+define(<MSG1>,<%xmm2>)
+define(<MSG2>,<%xmm3>)
+define(<MSG3>,<%xmm4>)
+define(<ABEF>,<%xmm5>)
+define(<CDGH>,<%xmm6>)
+define(<ABEF_ORIG>,<%xmm7>)
+define(<CDGH_ORIG>, <%xmm8>)
+define(<SWAP_MASK>,<%xmm9>)
+define(<TMP>, <%xmm9>)	C Overlaps SWAP_MASK
+
+C QROUND(M0, M1, M2, M3, R)
+define(<QROUND>, <
+	movdqa	eval($5*4)(K), MSGK
+	paddd	$1, MSGK
+	sha256rnds2 ABEF, CDGH
+	pshufd	<$>0xe, MSGK, MSGK
+	sha256rnds2 CDGH, ABEF
+	movdqa	$1, TMP
+	palignr	<$>4, $4, TMP
+	paddd	TMP, $2
+	sha256msg2 $1, $2
+	sha256msg1 $1, $4
+>)
+
+C FIXME: Do something more clever, taking the pshufd into account.
+C TRANSPOSE(ABCD, EFGH, scratch) --> untouched, ABEF, CDGH
+define(<TRANSPOSE>, <
+	movdqa	$2, $3
+	punpckhqdq $1, $2
+	punpcklqdq $1, $3
+>)
+
+	C void
+	C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+
+	.text
+	ALIGN(16)
+.Lswap_mask:
+	.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12
+PROLOGUE(_nettle_sha256_compress)
+	W64_ENTRY(3, 10)
+	movups	(STATE), TMP
+	movups	16(STATE), ABEF
+
+	pshufd	$0x1b, TMP, TMP
+	pshufd	$0x1b, ABEF, ABEF
+
+	TRANSPOSE(TMP, ABEF, CDGH)
+
+	movdqa	.Lswap_mask(%rip), SWAP_MASK
+
+	movdqa	ABEF, ABEF_ORIG
+	movdqa	CDGH, CDGH_ORIG
+
+	movups	(INPUT), MSG0
+	pshufb	SWAP_MASK, MSG0
+
+	movdqa	(K), MSGK
+	paddd	MSG0, MSGK
+	sha256rnds2 ABEF, CDGH	C Round 0-1
+	pshufd	$0xe, MSGK, MSGK
+	sha256rnds2 CDGH, ABEF	C Round 2-3
+
+	movups	16(INPUT), MSG1
+	pshufb	SWAP_MASK, MSG1
+
+	movdqa	16(K), MSGK
+	paddd	MSG1, MSGK
+	sha256rnds2 ABEF, CDGH	C Round 4-5
+	pshufd	$0xe, MSGK, MSGK
+	sha256rnds2 CDGH, ABEF	C Round 6-7
+	sha256msg1 MSG1, MSG0
+
+	movups	32(INPUT), MSG2
+	pshufb	SWAP_MASK, MSG2
+
+	movdqa	32(K), MSGK
+	paddd	MSG2, MSGK
+	sha256rnds2 ABEF, CDGH	C Round 8-9
+	pshufd	$0xe, MSGK, MSGK
+	sha256rnds2 CDGH, ABEF	C Round 10-11
+	sha256msg1 MSG2, MSG1
+
+	movups	48(INPUT), MSG3
+	pshufb	SWAP_MASK, MSG3
+
+	QROUND(MSG3, MSG0, MSG1, MSG2, 12)	C Round 12-15
+	QROUND(MSG0, MSG1, MSG2, MSG3, 16)
+	QROUND(MSG1, MSG2, MSG3, MSG0, 20)
+	QROUND(MSG2, MSG3, MSG0, MSG1, 24)
+	QROUND(MSG3, MSG0, MSG1, MSG2, 28)
+	QROUND(MSG0, MSG1, MSG2, MSG3, 32)
+	QROUND(MSG1, MSG2, MSG3, MSG0, 36)
+	QROUND(MSG2, MSG3, MSG0, MSG1, 40)
+	QROUND(MSG3, MSG0, MSG1, MSG2, 44)
+	QROUND(MSG0, MSG1, MSG2, MSG3, 48)
+
+	movdqa	208(K), MSGK
+	paddd	MSG1, MSGK
+	sha256rnds2 ABEF, CDGH	C Round 52-53
+	pshufd	$0xe, MSGK, MSGK
+	sha256rnds2 CDGH, ABEF	C Round 54-55
+	movdqa	MSG1, TMP
+	palignr	$4, MSG0, TMP
+	paddd	TMP, MSG2
+	sha256msg2 MSG1, MSG2
+
+	movdqa	224(K), MSGK
+	paddd	MSG2, MSGK
+	sha256rnds2 ABEF, CDGH	C Round 56-57
+	pshufd	$0xe, MSGK, MSGK
+	sha256rnds2 CDGH, ABEF	C Round 58-59
+	movdqa	MSG2, TMP
+	palignr	$4, MSG1, TMP
+	paddd	TMP, MSG3
+	sha256msg2 MSG2, MSG3
+
+	movdqa	240(K), MSGK
+	paddd	MSG3, MSGK
+	sha256rnds2 ABEF, CDGH	C Round 60-61
+	pshufd	$0xe, MSGK, MSGK
+	sha256rnds2 CDGH, ABEF	C Round 62-63
+
+	paddd	ABEF_ORIG, ABEF
+	paddd	CDGH_ORIG, CDGH
+
+	TRANSPOSE(ABEF, CDGH, TMP)
+
+	pshufd	$0x1b, CDGH, CDGH
+	pshufd	$0x1b, TMP, TMP
+	movups	CDGH, 0(STATE)
+	movups	TMP, 16(STATE)
+
+	W64_EXIT(3, 10)
+	ret
+EPILOGUE(_nettle_sha256_compress)
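
Editorial note: for readers checking the assembly against the SHA-256 specification, here is a minimal, self-contained C sketch of the transform this file implements, grouped to mirror the instructions above. The name sha256_compress_ref and its standalone form are illustrative assumptions, not Nettle's API; only the signature matches the C prototype quoted in the patch.

#include <stdint.h>

#define ROTR(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

/* Illustrative reference only; not Nettle's actual C code. */
static uint32_t sigma0(uint32_t x) { return ROTR(x,7) ^ ROTR(x,18) ^ (x >> 3); }
static uint32_t sigma1(uint32_t x) { return ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10); }
static uint32_t Sigma0(uint32_t x) { return ROTR(x,2) ^ ROTR(x,13) ^ ROTR(x,22); }
static uint32_t Sigma1(uint32_t x) { return ROTR(x,6) ^ ROTR(x,11) ^ ROTR(x,25); }

void
sha256_compress_ref(uint32_t *state, const uint8_t *input, const uint32_t *k)
{
  uint32_t w[64], v[8];
  unsigned i;

  /* Load the 16 message words big-endian; the assembly does this
     with pshufb and the .Lswap_mask byte permutation. */
  for (i = 0; i < 16; i++)
    w[i] = ((uint32_t) input[4*i] << 24) | ((uint32_t) input[4*i+1] << 16)
      | ((uint32_t) input[4*i+2] << 8) | (uint32_t) input[4*i+3];

  /* Message schedule; sha256msg1, palignr/paddd and sha256msg2
     together produce four of these words per QROUND. */
  for (i = 16; i < 64; i++)
    w[i] = sigma1(w[i-2]) + w[i-7] + sigma0(w[i-15]) + w[i-16];

  for (i = 0; i < 8; i++)
    v[i] = state[i];

  /* One iteration here is one round; each sha256rnds2 retires two,
     so a QROUND invocation covers four. */
  for (i = 0; i < 64; i++)
    {
      uint32_t ch = v[6] ^ (v[4] & (v[5] ^ v[6]));
      uint32_t maj = (v[0] & v[1]) | (v[2] & (v[0] | v[1]));
      uint32_t t1 = v[7] + Sigma1(v[4]) + ch + k[i] + w[i];
      uint32_t t2 = Sigma0(v[0]) + maj;
      v[7] = v[6]; v[6] = v[5]; v[5] = v[4]; v[4] = v[3] + t1;
      v[3] = v[2]; v[2] = v[1]; v[1] = v[0]; v[0] = t1 + t2;
    }

  /* Feed-forward, matching the final paddd of ABEF_ORIG/CDGH_ORIG. */
  for (i = 0; i < 8; i++)
    state[i] += v[i];
}

Here k is the usual table of 64 SHA-256 round constants, laid out so the assembly can fetch four at a time at offsets 0, 16, ..., 240 from the K register. MSGK holds the four sums w[i]+k[i]; sha256rnds2 consumes the low two, and the pshufd $0xe between the two sha256rnds2 instructions moves the upper two into the low half for the next pair of rounds.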