 fat-s390x.c                  |  16 +-
 s390x/fat/sha3-permute-2.asm |  36 +
 s390x/vf/sha3-permute.asm    | 383 +
 3 files changed, 434 insertions(+), 1 deletion(-)
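This patch adds an s390x vector-facility implementation of the SHA3 (Keccak) permutation and wires it into the fat-library dispatch. For reference, the assembly computes Keccak-f[1600] over the 25 64-bit lanes of struct sha3_state; the following is a minimal portable C sketch of that permutation, in the textbook formulation with illustrative names (sha3_permute_ref, rho, pi), not Nettle's actual C code. The constants in rc[] are the standard round constants; they match the .rc table at the end of the patch, which stores them in reverse.

  #include <stdint.h>

  struct sha3_state { uint64_t a[25]; };   /* 5x5 matrix of 64-bit lanes */

  #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))

  static const uint64_t rc[24] = {
    0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL,
    0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL,
    0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL,
    0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
    0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL,
    0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
    0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL,
    0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL,
  };

  /* Rotation counts and lane order for the combined rho+pi step. */
  static const unsigned rho[24] =
    { 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
      27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 };
  static const unsigned pi[24] =
    { 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
      15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 };

  void
  sha3_permute_ref (struct sha3_state *state)
  {
    uint64_t *A = state->a;
    for (unsigned round = 0; round < 24; round++)
      {
        uint64_t C[5], D, t, u;
        /* theta: XOR each lane with the parity of two columns. */
        for (unsigned x = 0; x < 5; x++)
          C[x] = A[x] ^ A[x+5] ^ A[x+10] ^ A[x+15] ^ A[x+20];
        for (unsigned x = 0; x < 5; x++)
          {
            D = C[(x+4) % 5] ^ ROTL64 (C[(x+1) % 5], 1);
            for (unsigned y = 0; y < 5; y++)
              A[x + 5*y] ^= D;
          }
        /* rho + pi: rotate each lane and move it to its new position. */
        t = A[1];
        for (unsigned i = 0; i < 24; i++)
          {
            u = A[pi[i]];
            A[pi[i]] = ROTL64 (t, rho[i]);
            t = u;
          }
        /* chi: nonlinear mixing, a[x] ^= ~a[x+1] & a[x+2] along rows. */
        for (unsigned y = 0; y < 25; y += 5)
          {
            uint64_t B[5];
            for (unsigned x = 0; x < 5; x++)
              B[x] = A[y + x];
            for (unsigned x = 0; x < 5; x++)
              A[y + x] = B[x] ^ (~B[(x+1) % 5] & B[(x+2) % 5]);
          }
        /* iota: XOR the round constant into lane (0,0). */
        A[0] ^= rc[round];
      }
  }

The assembly below keeps lane 0 of each row in a general-purpose register (A00, A05, A10, A15, A20) and the remaining lanes packed in pairs in 128-bit vector registers (A0102, A0304, ...), so theta and chi operate on two lanes per vector instruction.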
diff --git a/fat-s390x.c b/fat-s390x.c
index 2e4fdec5..db793e2c 100644
--- a/fat-s390x.c
+++ b/fat-s390x.c
@@ -264,6 +264,10 @@ DECLARE_FAT_FUNC(_nettle_sha512_compress, sha512_compress_func)
 DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, c)
 DECLARE_FAT_FUNC_VAR(sha512_compress, sha512_compress_func, s390x)
 
+DECLARE_FAT_FUNC(nettle_sha3_permute, sha3_permute_func)
+DECLARE_FAT_FUNC_VAR(sha3_permute, sha3_permute_func, c)
+DECLARE_FAT_FUNC_VAR(sha3_permute, sha3_permute_func, s390x)
+
 static void CONSTRUCTOR
 fat_init (void)
 {
@@ -279,10 +283,16 @@ fat_init (void)
       if (verbose)
 	fprintf (stderr, "libnettle: enabling vectorized memxor3.\n");
       nettle_memxor3_vec = _nettle_memxor3_s390x;
+
+      if (verbose)
+	fprintf (stderr, "libnettle: enabling vectorized sha3 permute.\n");
+      nettle_sha3_permute_vec = _nettle_sha3_permute_s390x;
     }
   else
     {
-      nettle_memxor3_vec = _nettle_memxor3_c;
+      nettle_memxor3_vec = _nettle_memxor3_c;
+
+      nettle_sha3_permute_vec = _nettle_sha3_permute_c;
     }
 
   /* AES128 */
@@ -483,3 +493,7 @@ DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
 DEFINE_FAT_FUNC(_nettle_sha512_compress, void,
 		(uint64_t *state, const uint8_t *input, const uint64_t *k),
 		(state, input, k))
+
+/* SHA3 */
+DEFINE_FAT_FUNC(nettle_sha3_permute, void,
+		(struct sha3_state *state), (state))
diff --git a/s390x/fat/sha3-permute-2.asm b/s390x/fat/sha3-permute-2.asm
new file mode 100644
index 00000000..304bf7bd
--- /dev/null
+++ b/s390x/fat/sha3-permute-2.asm
@@ -0,0 +1,36 @@
+C s390x/fat/sha3-permute-2.asm
+
+ifelse(`
+   Copyright (C) 2021 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_sha3_permute) picked up by configure
+
+define(`fat_transform', `_$1_s390x')
+include_src(`s390x/vf/sha3-permute.asm')
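The fat wrapper above renames the routine via fat_transform, so PROLOGUE(nettle_sha3_permute) in the included source assembles as _nettle_sha3_permute_s390x, and fat_init() selects it at load time when the vector facility is present. A stripped-down model of that dispatch pattern follows; every name in it is illustrative (the real mechanism is the DECLARE_FAT_FUNC/DEFINE_FAT_FUNC macros in the fat-s390x.c hunks above, with CONSTRUCTOR abstracting the attribute), and have_vector_facility() stands in for the real CPU probe.

  #include <stdint.h>

  struct sha3_state { uint64_t a[25]; };

  typedef void sha3_permute_func (struct sha3_state *state);

  /* The two candidate implementations; stubs here. */
  static void
  permute_c (struct sha3_state *state) { (void) state; /* portable C path */ }
  static void
  permute_s390x (struct sha3_state *state) { (void) state; /* vector path */ }

  /* Hypothetical probe; the real fat_init() checks the s390x facility
     list for the vector facility. */
  static int
  have_vector_facility (void) { return 0; }

  /* The pointer the public entry point jumps through. */
  static sha3_permute_func *sha3_permute_vec = permute_c;

  /* Runs at load time, like Nettle's CONSTRUCTOR fat_init(). */
  __attribute__ ((constructor))
  static void
  fat_init (void)
  {
    sha3_permute_vec = have_vector_facility () ? permute_s390x : permute_c;
  }

  /* What DEFINE_FAT_FUNC(nettle_sha3_permute, ...) expands to, in spirit:
     the public function is a trampoline through the pointer. */
  void
  sha3_permute (struct sha3_state *state)
  {
    sha3_permute_vec (state);
  }

Resolving the pointer once in a constructor keeps the per-call overhead to a single indirect branch, while letting one shared library serve CPUs with and without the vector facility.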
diff --git a/s390x/vf/sha3-permute.asm b/s390x/vf/sha3-permute.asm
new file mode 100644
index 00000000..4f4a4717
--- /dev/null
+++ b/s390x/vf/sha3-permute.asm
@@ -0,0 +1,383 @@
+C s390x/vf/sha3-permute.asm
+
+ifelse(`
+   Copyright (C) 2012 Niels Möller
+   Copyright (C) 2021 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+define(`STATE', `%r2')	C 25 64-bit values, 200 bytes.
+
+define(`COUNT', `%r3')
+
+define(`A00', `%r0')
+define(`A0102', `%v0')
+define(`A0304', `%v1')
+
+define(`A05', `%r4')
+define(`A0607', `%v2')
+define(`A0809', `%v3')
+
+define(`A10', `%r5')
+define(`A1112', `%v4')
+define(`A1314', `%v5')
+
+define(`A15', `%r6')
+define(`A1617', `%v6')
+define(`A1819', `%v7')
+
+define(`A20', `%r7')
+define(`A2122', `%v8')
+define(`A2324', `%v9')
+
+define(`C0', `%r8')
+define(`C12', `%v24')
+define(`C34', `%v25')
+
+define(`D0', `%r9')
+define(`D12', `%v26')
+define(`D34', `%v27')
+
+C Wide temporaries
+define(`W0', `%v28')
+define(`W1', `%v29')
+define(`W2', `%v30')
+define(`W3', `%v31')
+
+define(`TMP', `%r9')	C Overlaps D0, which is dead by the time TMP is used.
+
+define(`T0', `%r10')
+define(`T1', `%r11')
+define(`T2', `%r12')
+define(`T3', `%r13')
+
+define(`RC', `%r14')
+
+.file "sha3-permute.asm"
+
+.text
+
+C void
+C sha3_permute(struct sha3_state *state)
+
+PROLOGUE(nettle_sha3_permute)
+	stmg	%r6,%r14,48(SP)
+	ALLOC_STACK(%r1,16)
+	std	%f8,0(%r1)
+	std	%f9,8(%r1)
+
+	lghi	COUNT,24*8
+	larl	RC,.rc
+	aghi	RC,-8
+
+	lg	A00,0*8(STATE)
+	vl	A0102,1*8(STATE)
+	vl	A0304,3*8(STATE)
+	lgr	C0,A00
+
+	lg	A05,5*8(STATE)
+	vl	A0607,6*8(STATE)
+	vl	A0809,8*8(STATE)
+	xgr	C0,A05
+	vx	C12,A0102,A0607
+	vx	C34,A0304,A0809
+
+	lg	A10,10*8(STATE)
+	vl	A1112,11*8(STATE)
+	vl	A1314,13*8(STATE)
+	xgr	C0,A10
+	vx	C12,C12,A1112
+	vx	C34,C34,A1314
+
+	lg	A15,15*8(STATE)
+	vl	A1617,16*8(STATE)
+	vl	A1819,18*8(STATE)
+	xgr	C0,A15
+	vx	C12,C12,A1617
+	vx	C34,C34,A1819
+
+	lg	A20,20*8(STATE)
+	vl	A2122,21*8(STATE)
+	vl	A2324,23*8(STATE)
+	xgr	C0,A20
+	vx	C12,C12,A2122
+	vx	C34,C34,A2324
+
+	j	.Loop
+
+.align 16
+.Loop:
+	vlvgg	D12,C0,0
+	vmrhg	D12,D12,C12		C Holds C0, C1
+	vpdi	D34,C12,C34,0b0100	C Holds C2, C3
+	vpdi	C34,C34,D12,0b0100	C Holds C4, C0
+	vlgvg	D0,C34,0
+	vlgvg	T0,C12,0
+	rllg	T0,T0,1
+	xgr	D0,T0
+
+	C Can use C12 as temporary
+	veslg	W0,D34,1
+	vesrlg	W1,D34,63
+	vx	D12,D12,W0
+	vx	D12,D12,W1		C Done D12
+
+	veslg	C12,C34,1
+	vesrlg	C34,C34,63
+	vx	D34,D34,C34
+	vx	D34,D34,C12		C Done D34
+
+	xgr	A00,D0
+	xgr	A05,D0
+	xgr	A10,D0
+	xgr	A15,D0
+	xgr	A20,D0
+	vx	A0102,A0102,D12
+	vx	A0607,A0607,D12
+	vx	A1112,A1112,D12
+	vx	A1617,A1617,D12
+	vx	A2122,A2122,D12
+	vx	A0304,A0304,D34
+	vx	A0809,A0809,D34
+	vx	A1314,A1314,D34
+	vx	A1819,A1819,D34
+	vx	A2324,A2324,D34
+
+	C Do the 1,2,3,4 rows. First rotate, then permute.
+	vesrlg	W0,A0102,63
+	veslg	W1,A0102,62
+	vesrlg	W2,A0102,2
+	veslg	A0102,A0102,1
+	vo	W0,W0,A0102	C veslg 1 (A01)
+	vo	W2,W2,W1	C veslg 62 (A02)
+
+	veslg	A0102,A0304,28
+	vesrlg	W1,A0304,36
+	vo	A0102,A0102,W1	C veslg 28 (A03)
+	vesrlg	W1,A0304,37
+	veslg	A0304,A0304,27
+	vo	A0304,A0304,W1	C veslg 27 (A04)
+
+	vmrhg	A0102,A0102,W0
+	vmrlg	A0304,A0304,W2
+
+	rllg	A05,A05,36
+	vlvgg	W0,A05,0
+	vlgvg	A05,A0607,0
+	rllg	A05,A05,44	C Done A05
+	verllg	W1,A0607,6
+	verllg	A0607,A0809,20
+	vmrlg	A0607,A0607,W1	C Done A0607
+	verllg	W1,A0809,55
+	vmrhg	A0809,W0,W1	C Done A0809
+
+	rllg	A10,A10,42	C 42 + 25 = 3 (mod 64)
+	verllg	W0,A1112,10
+	vlvgg	A1112,A10,0
+	vlgvg	A10,A1112,1
+	rllg	A10,A10,43	C Done A10
+
+	vmrhg	A1112,A1112,A1314
+	verllg	A1112,A1112,25	C Done A1112
+	verllg	W2,A1314,39
+	vpdi	A1314,W0,W2,0b0001	C Done A1314
+
+	verllg	W0,A1819,8
+	rllg	A15,A15,41
+	vlvgg	W1,A15,1
+	vlgvg	A15,A1819,0
+	rllg	A15,A15,21	C Done A15
+	verllg	A1819,A1617,15
+	verllg	A1617,A1617,45
+	vpdi	A1617,A1617,W0,0b0001	C Done A1617
+	vmrlg	A1819,A1819,W1	C Done A1819
+
+	rllg	A20,A20,18
+	vlvgg	W0,A20,1
+	vlgvg	A20,A2324,1
+	rllg	A20,A20,14	C Done A20
+	verllg	A2324,A2324,56
+
+	verllg	W2,A2122,2
+	vmrhg	A2324,A2324,W2	C Done A2324
+
+	verllg	A2122,A2122,61
+	vmrlg	A2122,A2122,W0	C Done A2122
+
+	C chi step. With the transposed matrix, applied independently
+	C to each column.
+	lghi	TMP,-1
+	lgr	T0,A05
+	xgr	T0,TMP
+	ngr	T0,A10
+	lgr	T1,A10
+	xgr	T1,TMP
+	ngr	T1,A15
+	lgr	T2,A15
+	xgr	T2,TMP
+	ngr	T2,A20
+	xgr	A10,T2
+	lgr	T3,A20
+	xgr	T3,TMP
+	ngr	T3,A00
+	xgr	A15,T3
+	lgr	T2,A00
+	xgr	T2,TMP
+	ngr	T2,A05
+	xgr	A20,T2
+	xgr	A00,T0
+	xgr	A05,T1
+
+	vnc	W0,A1112,A0607
+	vnc	W1,A1617,A1112
+	vnc	W2,A2122,A1617
+	vx	A1112,A1112,W2
+	vnc	W3,A0102,A2122
+	vx	A1617,A1617,W3
+	vnc	W2,A0607,A0102
+	vx	A2122,A2122,W2
+	vx	A0102,A0102,W0
+	vx	A0607,A0607,W1
+
+	vnc	W0,A1314,A0809
+	vnc	W1,A1819,A1314
+	vnc	W2,A2324,A1819
+	vx	A1314,A1314,W2
+	vnc	W3,A0304,A2324
+	vx	A1819,A1819,W3
+	vnc	W2,A0809,A0304
+	vx	A2324,A2324,W2
+	vx	A0304,A0304,W0
+	vx	A0809,A0809,W1
+
+	lg	TMP,0(COUNT,RC)
+	xgr	A00,TMP
+
+	C Transpose.
+	C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304,
+	C and also copy to C12 and C34 while at it.
+
+	vlvgg	C12,A05,0
+	vlvgg	C34,A15,0
+	vlvgg	W0,A10,0
+	vlvgg	W1,A20,0
+	lgr	C0,A00
+	vlgvg	A05,A0102,0
+	vlgvg	A15,A0304,0
+	xgr	C0,A05
+	xgr	C0,A15
+	vlgvg	A10,A0102,1
+	vlgvg	A20,A0304,1
+	vmrhg	A0102,C12,W0
+	vmrhg	A0304,C34,W1
+
+	C Transpose (A0607, A1112)
+	vlr	W0,A0607
+	vmrhg	A0607,A0607,A1112
+	xgr	C0,A10
+	xgr	C0,A20
+	vmrlg	A1112,W0,A1112
+
+	C Transpose (A1819, A2324)
+	vlr	W0,A1819
+	vmrhg	A1819,A1819,A2324
+	vx	C12,A0102,A0607
+	vx	C12,C12,A1112
+	vmrlg	A2324,W0,A2324
+
+	C Transpose (A0809, A1314) and (A1617, A2122), and swap
+	vlr	W0,A0809
+	vlr	W1,A1314
+	vx	C34,A0304,A1819
+	vx	C34,C34,A2324
+	vmrhg	A0809,A1617,A2122
+	vmrlg	A1314,A1617,A2122
+	vx	C34,C34,A0809
+	vx	C34,C34,A1314
+	vmrhg	A1617,W0,W1
+	vmrlg	A2122,W0,W1
+
+	ahi	COUNT,-8
+	vx	C12,C12,A1617
+	vx	C12,C12,A2122
+	clijne	COUNT,0,.Loop
+
+	stg	A00,0*8(STATE)
+	vst	A0102,1*8(STATE)
+	vst	A0304,3*8(STATE)
+
+	stg	A05,5*8(STATE)
+	vst	A0607,6*8(STATE)
+	vst	A0809,8*8(STATE)
+
+	stg	A10,10*8(STATE)
+	vst	A1112,11*8(STATE)
+	vst	A1314,13*8(STATE)
+
+	stg	A15,15*8(STATE)
+	vst	A1617,16*8(STATE)
+	vst	A1819,18*8(STATE)
+
+	stg	A20,20*8(STATE)
+	vst	A2122,21*8(STATE)
+	vst	A2324,23*8(STATE)
+
+	ld	%f8,0(%r1)
+	ld	%f9,8(%r1)
+	FREE_STACK(16)
+	lmg	%r6,%r14,48(SP)
+
+	br	RA
+EPILOGUE(nettle_sha3_permute)
+
+.align 16
+.rc:	C In reverse order
+	.quad	0x8000000080008008
+	.quad	0x0000000080000001
+	.quad	0x8000000000008080
+	.quad	0x8000000080008081
+	.quad	0x800000008000000A
+	.quad	0x000000000000800A
+	.quad	0x8000000000000080
+	.quad	0x8000000000008002
+	.quad	0x8000000000008003
+	.quad	0x8000000000008089
+	.quad	0x800000000000008B
+	.quad	0x000000008000808B
+	.quad	0x000000008000000A
+	.quad	0x0000000080008009
+	.quad	0x0000000000000088
+	.quad	0x000000000000008A
+	.quad	0x8000000000008009
+	.quad	0x8000000080008081
+	.quad	0x0000000080000001
+	.quad	0x000000000000808B
+	.quad	0x8000000080008000
+	.quad	0x800000000000808A
+	.quad	0x0000000000008082
+	.quad	0x0000000000000001
+.size .rc,.-.rc
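Two notes on the loop body above. First, the chi step: because the state matrix is kept transposed, chi (normally a row operation) is applied to each five-lane column, with one lane in a GPR and the other four in two vector registers. The vnc instruction computes AND-with-complement, which is why the vector half needs no separate NOT, unlike the lghi TMP,-1 / xgr / ngr sequence used for the GPR lanes. A sketch of the per-column update (chi_column is an illustrative name, not Nettle code):

  #include <stdint.h>

  /* chi on one five-lane column: a[i] ^= ~a[i+1] & a[i+2], indices mod 5.
     The assembly computes the same expression with xgr/ngr on the GPR
     lane and vnc/vx on the lane pairs, two columns per vector op. */
  static void
  chi_column (uint64_t a[5])
  {
    uint64_t t[5];
    for (unsigned i = 0; i < 5; i++)
      t[i] = ~a[(i + 1) % 5] & a[(i + 2) % 5];   /* what vnc computes */
    for (unsigned i = 0; i < 5; i++)
      a[i] ^= t[i];
  }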
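Second, the round constants: the .rc table is stored in reverse so that COUNT, counting down from 24*8 to 8, serves both as the loop counter and as a byte offset into the table. With RC pre-biased by -8 (aghi RC,-8), lg TMP,0(COUNT,RC) reads entry COUNT/8 - 1, i.e. entry 23 on the first round, which is round 0's constant. In C terms (iota_schedule and rc_reversed are illustrative names):

  #include <stdint.h>

  /* rc_reversed[] is the .rc table above, i.e. rc_reversed[i] == rc[23 - i]
     with rc[] in forward round order (see the first sketch). */
  static void
  iota_schedule (uint64_t *a00, const uint64_t rc_reversed[24])
  {
    /* The countdown mirrors lghi COUNT,24*8 / lg TMP,0(COUNT,RC) /
       ahi COUNT,-8: entry 23 is consumed first.  The real loop applies
       one constant per round; this collapses the schedule to show the
       indexing only. */
    for (int count = 24; count > 0; count--)
      *a00 ^= rc_reversed[count - 1];
  }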