diff options
author | Niels Möller <nisse@lysator.liu.se> | 2018-02-21 23:04:44 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2018-02-21 23:04:44 +0100 |
commit | 34ea00b47003939fe1ea616f2e57067cea2c7c8f (patch) | |
tree | 8f7fc8cfe97b7a4b24a182345298496d774e62fc /x86_64 | |
parent | b908c40a2dcffb6cbab66360c3c1a6a554faac9d (diff) | |
download | nettle-34ea00b47003939fe1ea616f2e57067cea2c7c8f.tar.gz |
New sha256 implementation using sha_ni instructions.
Diffstat (limited to 'x86_64')
-rw-r--r-- | x86_64/sha_ni/sha256-compress.asm | 175 |
1 file changed, 175 insertions, 0 deletions
diff --git a/x86_64/sha_ni/sha256-compress.asm b/x86_64/sha_ni/sha256-compress.asm new file mode 100644 index 00000000..f2a4bd32 --- /dev/null +++ b/x86_64/sha_ni/sha256-compress.asm @@ -0,0 +1,175 @@ +C x86_64/sha_ni/sha256-compress.asm + +ifelse(< + Copyright (C) 2018 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
>)

	.file "sha256-compress.asm"

C void
C _nettle_sha256_compress(uint32_t *state, const uint8_t *input,
C                         const uint32_t *k)
C
C One application of the SHA-256 compression function on a single
C 64-byte block, using the x86 SHA new instructions (sha_ni).
C STATE points to the eight 32-bit hash words, INPUT to the message
C block, K to the table of 64 round constants.

define(<STATE>, <%rdi>)
define(<INPUT>, <%rsi>)
define(<K>, <%rdx>)

define(<MSGK>,<%xmm0>)	C Implicit operand of sha256rnds2
define(<MSG0>,<%xmm1>)
define(<MSG1>,<%xmm2>)
define(<MSG2>,<%xmm3>)
define(<MSG3>,<%xmm4>)
define(<ABEF>,<%xmm5>)	C Hash words A, B, E, F, layout used by sha256rnds2
define(<CDGH>,<%xmm6>)	C Hash words C, D, G, H
define(<ABEF_ORIG>,<%xmm7>)	C Input state, added back at the end
define(<CDGH_ORIG>, <%xmm8>)
define(<SWAP_MASK>,<%xmm9>)	C pshufb mask, big-endian word load
define(<TMP>, <%xmm9>)	C Overlaps SWAP_MASK. SWAP_MASK is loaded only
			C after the last early use of TMP, and TMP is
			C reused only after the last pshufb.

C QROUND(M0, M1, M2, M3, R)
C Performs four rounds, R to R+3, and advances the message schedule.
C On entry M0 holds message words W[R..R+3], M3 holds W[R-4..R-1]
C (already partially extended by an earlier sha256msg1), and M1 is
C being turned into W[R+4..R+7].  M2 is unused here; the argument is
C kept so call sites rotate the four registers uniformly.  The
C immediate operands are written with quoted dollar signs to protect
C them from m4 argument substitution.
define(<QROUND>, <
	movdqa	eval($5*4)(K), MSGK	C MSGK = K[R..R+3]
	paddd	$1, MSGK		C MSGK += W[R..R+3]
	sha256rnds2	ABEF, CDGH	C Rounds R, R+1, low half of MSGK
	pshufd	<$>0xe, MSGK, MSGK	C Move high half of MSGK down
	sha256rnds2	CDGH, ABEF	C Rounds R+2, R+3
	movdqa	$1, TMP
	palignr	<$>4, $4, TMP		C TMP = W[R-3..R], the W[t-7] terms
	paddd	TMP, $2			C Add W[t-7] terms into M1
	sha256msg2	$1, $2		C Finish M1 = W[R+4..R+7]
	sha256msg1	$1, $4		C First extension step for M3
	>)

C FIXME: Do something more clever, taking the pshufd into account.
C TRANSPOSE(ABCD, EFGH, scratch) --> untouched, ABEF, CDGH
C Interleaves the high and low quadwords of the two inputs; used both
C to build the ABEF/CDGH layout on entry and to undo it on exit.
define(<TRANSPOSE>, <
	movdqa	$2, $3
	punpckhqdq	$1, $2
	punpcklqdq	$1, $3
>)

	C void
	C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)

	.text
	ALIGN(16)
.Lswap_mask:
	.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12

PROLOGUE(_nettle_sha256_compress)
	W64_ENTRY(3, 10)
	C Load state and rearrange the eight words from natural order
	C into the ABEF / CDGH register layout expected by sha256rnds2.
	movups	(STATE), TMP
	movups	16(STATE), ABEF

	pshufd	$0x1b, TMP, TMP		C Reverse word order
	pshufd	$0x1b, ABEF, ABEF

	TRANSPOSE(TMP, ABEF, CDGH)

	C Only now is it safe to load SWAP_MASK, since it shares
	C a register with TMP.
	movdqa	.Lswap_mask(%rip), SWAP_MASK

	movdqa	ABEF, ABEF_ORIG		C Save input state for final add
	movdqa	CDGH, CDGH_ORIG

	C Rounds 0-11 are open-coded: each 16-byte chunk of the input
	C is loaded and byte-swapped to big-endian word order as it is
	C first needed, and the schedule recursion (sha256msg1) can
	C only start once two chunks are available.
	movups	(INPUT), MSG0
	pshufb	SWAP_MASK, MSG0

	movdqa	(K), MSGK
	paddd	MSG0, MSGK
	sha256rnds2	ABEF, CDGH	C Round 0-1
	pshufd	$0xe, MSGK, MSGK
	sha256rnds2	CDGH, ABEF	C Round 2-3

	movups	16(INPUT), MSG1
	pshufb	SWAP_MASK, MSG1

	movdqa	16(K), MSGK
	paddd	MSG1, MSGK
	sha256rnds2	ABEF, CDGH	C Round 4-5
	pshufd	$0xe, MSGK, MSGK
	sha256rnds2	CDGH, ABEF	C Round 6-7
	sha256msg1	MSG1, MSG0	C Start extending W[0..3]

	movups	32(INPUT), MSG2
	pshufb	SWAP_MASK, MSG2

	movdqa	32(K), MSGK
	paddd	MSG2, MSGK
	sha256rnds2	ABEF, CDGH	C Round 8-9
	pshufd	$0xe, MSGK, MSGK
	sha256rnds2	CDGH, ABEF	C Round 10-11
	sha256msg1	MSG2, MSG1

	movups	48(INPUT), MSG3
	pshufb	SWAP_MASK, MSG3

	C Steady state: rounds 12-51, with the four MSG registers
	C rotating through the schedule on each QROUND.
	QROUND(MSG3, MSG0, MSG1, MSG2, 12)	C Round 12-15
	QROUND(MSG0, MSG1, MSG2, MSG3, 16)
	QROUND(MSG1, MSG2, MSG3, MSG0, 20)
	QROUND(MSG2, MSG3, MSG0, MSG1, 24)
	QROUND(MSG3, MSG0, MSG1, MSG2, 28)
	QROUND(MSG0, MSG1, MSG2, MSG3, 32)
	QROUND(MSG1, MSG2, MSG3, MSG0, 36)
	QROUND(MSG2, MSG3, MSG0, MSG1, 40)
	QROUND(MSG3, MSG0, MSG1, MSG2, 44)
	QROUND(MSG0, MSG1, MSG2, MSG3, 48)

	C Rounds 52-63 are open-coded again: the remaining schedule
	C updates taper off (no further sha256msg1 steps are needed).
	movdqa	208(K), MSGK
	paddd	MSG1, MSGK
	sha256rnds2	ABEF, CDGH	C Round 52-53
	pshufd	$0xe, MSGK, MSGK
	sha256rnds2	CDGH, ABEF	C Round 54-55
	movdqa	MSG1, TMP
	palignr	$4, MSG0, TMP
	paddd	TMP, MSG2
	sha256msg2	MSG1, MSG2	C Finish W[56..59]

	movdqa	224(K), MSGK
	paddd	MSG2, MSGK
	sha256rnds2	ABEF, CDGH	C Round 56-57
	pshufd	$0xe, MSGK, MSGK
	sha256rnds2	CDGH, ABEF	C Round 58-59
	movdqa	MSG2, TMP
	palignr	$4, MSG1, TMP
	paddd	TMP, MSG3
	sha256msg2	MSG2, MSG3	C Finish W[60..63]

	movdqa	240(K), MSGK
	paddd	MSG3, MSGK
	sha256rnds2	ABEF, CDGH	C Round 60-61
	pshufd	$0xe, MSGK, MSGK
	sha256rnds2	CDGH, ABEF	C Round 62-63

	paddd	ABEF_ORIG, ABEF		C Add in the input state
	paddd	CDGH_ORIG, CDGH

	C SWAP_MASK is dead here, so its register is free for use
	C as transpose scratch.  Undo the layout rearrangement and
	C store the updated state.
	TRANSPOSE(ABEF, CDGH, TMP)

	pshufd	$0x1b, CDGH, CDGH
	pshufd	$0x1b, TMP, TMP
	movups	CDGH, 0(STATE)
	movups	TMP, 16(STATE)

	W64_EXIT(3, 10)
	ret
EPILOGUE(_nettle_sha256_compress)