diff options
author | Mamone Tarsha <maamoun.tk@googlemail.com> | 2021-10-24 20:39:11 +0200 |
---|---|---|
committer | Mamone Tarsha <maamoun.tk@googlemail.com> | 2021-10-24 20:39:11 +0200 |
commit | 571d2cc2895d587001a14d951ba317e81e9e3bfd (patch) | |
tree | d8b24caa130936e717b5c78aa1bd94eadbabee7b /s390x | |
parent | 259ec19afbb3645d796aa31014416848b80912b2 (diff) | |
download | nettle-571d2cc2895d587001a14d951ba317e81e9e3bfd.tar.gz |
[S390x] Improvements on documentation and instruction set usage for SHA3 permute
Diffstat (limited to 's390x')
-rw-r--r-- | s390x/vf/sha3-permute.asm | 106 |
1 files changed, 71 insertions, 35 deletions
diff --git a/s390x/vf/sha3-permute.asm b/s390x/vf/sha3-permute.asm index 517ce894..d656b97c 100644 --- a/s390x/vf/sha3-permute.asm +++ b/s390x/vf/sha3-permute.asm @@ -85,15 +85,17 @@ C void C sha3_permute(struct sha3_ctx *ctx) PROLOGUE(nettle_sha3_permute) - stmg %r6,%r14,48(SP) - ALLOC_STACK(%r1,16) + stmg %r6,%r14,48(SP) C Save non-volatile general registers + ALLOC_STACK(%r1,16) C Allocate 16-byte space on stack + C Save non-volatile floating point registers std %f8,0(%r1) std %f9,8(%r1) lghi COUNT,24*8 - larl RC,.rc + larl RC,.rc C Load address of rc data aghi RC,-8 + C Load state data lg A00,0*8(STATE) vl A0102,1*8(STATE) vl A0304,3*8(STATE) @@ -130,25 +132,31 @@ PROLOGUE(nettle_sha3_permute) .align 16 .Loop: + C The theta step. Combine parity bits, then xor to state. + C D0 = C4 ^ (C1 <<< 1) + C D1 = C0 ^ (C2 <<< 1) + C D2 = C1 ^ (C3 <<< 1) + C D3 = C2 ^ (C4 <<< 1) + C D4 = C3 ^ (C0 <<< 1) + + C Shift the words around, putting (C0, C1) in D12, (C2, C3) in + C D34, and (C4, C0) in C34. + vlvgg D12,C0,0 vmrhg D12,D12,C12 C Holds C0, C1 vpdi D34,C12,C34,0b0100 C Holds C2, C3 vpdi C34,C34,D12,0b0100 C Holds C4, C0 - vlgvg D0,C34,0 + vlgvg T0,C12,0 + vlgvg D0,C34,0 rllg T0,T0,1 xgr D0,T0 - C Can use C12 as temporary - veslg W0,D34,1 - vesrlg W1,D34,63 - vx D12,D12,W0 - vx D12,D12,W1 C Done D12 + verllg W0,D34,1 + vx D12,D12,W0 C Done D12 - veslg C12,C34,1 - vesrlg C34,C34,63 - vx D34,D34,C34 - vx D34,D34,C12 C Done D34 + verllg W1,C34,1 + vx D34,D34,W1 C Done D34 xgr A00,D0 xgr A05,D0 @@ -166,24 +174,37 @@ PROLOGUE(nettle_sha3_permute) vx A1819,A1819,D34 vx A2324,A2324,D34 - C Do the 1,2,3,4 row. First rotate, then permute. 
- vesrlg W0,A0102,63 - veslg W1,A0102,62 - vesrlg W2,A0102,2 - veslg A0102,A0102,1 - vo W0,W0,A0102 C veslg 1 (A01) - vo W2,W2,W1 C veslg 62 (A02) - - veslg A0102,A0304,28 - vesrlg W1,A0304,36 - vo A0102,A0102,W1 C veslg 28 (A03) - vesrlg W1,A0304,37 - veslg A0304,A0304,27 - vo A0304,A0304,W1 C veslg 27 (A04) + C theta step done, no C, D or W temporaries alive. + + C rho step. When doing the permutations, also + C transpose the rows of matrix into temporary + C coordinates to assist the chi step. + C Defer pi step to the last phase. + + C The combined permutation + transpose gives the following + C cycles (rotation counts in parentheses) + C 0 <- 0(0) + C 1 <- 3(28) <- 4(27) <- 2(62) <- 1(1) + C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36) + C 7 <- 7(6) + C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3) + C 14 <- 14(39) + C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41) + C 16 <- 16(45) + C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18) + C 23 <- 23(56) + + C Do the 1,2,3,4 row. First rotate (permute), then transpose. + verllg W0,A0102,1 C verllg 1 (A01) + verllg W2,A0102,62 C verllg 62 (A02) + + verllg A0102,A0304,28 C verllg 28 (A03) + verllg A0304,A0304,27 C verllg 27 (A04) vmrhg A0102,A0102,W0 vmrlg A0304,A0304,W2 + C Do the 5,6,7,8,9 row. rllg A05,A05,36 vlvgg W0,A05,0 vlgvg A05,A0607,0 @@ -194,17 +215,19 @@ PROLOGUE(nettle_sha3_permute) verllg W1,A0809,55 vmrhg A0809,W0,W1 C Done A0809 - rllg A10,A10,42 C 42 + 25 = 3 (mod 64) + C Do the 10,11,12,13,14 row. + C Rotated using verllg with (25) later. 42 + 25 = 3 (mod 64) + rllg A10,A10,42 verllg W0,A1112,10 vlvgg A1112,A10,0 vlgvg A10,A1112,1 rllg A10,A10,43 C Done A10 - vmrhg A1112,A1112,A1314 verllg A1112,A1112,25 C Done A1112 verllg W2,A1314,39 vpdi A1314,W0,W2,0b0001 C Done A1314 + C Do the 15,16,17,18,19 row. verllg W0,A1819,8 rllg A15,A15,41 vlvgg W1,A15,1 @@ -215,15 +238,14 @@ PROLOGUE(nettle_sha3_permute) vpdi A1617,A1617,W0,0b0001 C Done A1617 vmrlg A1819,A1819,W1 C Done A1819 + C Do the 20,21,22,23,24 row. 
rllg A20,A20,18 vlvgg W0,A20,1 vlgvg A20,A2324,1 rllg A20,A20,14 C Done A20 verllg A2324,A2324,56 - verllg W2,A2122,2 vmrhg A2324,A2324,W2 C Done A2324 - verllg A2122,A2122,61 vmrlg A2122,A2122,W0 C Done A2122 @@ -268,13 +290,25 @@ PROLOGUE(nettle_sha3_permute) vx A0304,A0304,W0 vx A0809,A0809,W1 + C iota step. lg TMP,0(COUNT,RC) xgr A00,TMP - C Transpose. + C Deferred pi step. Transpose the matrix from the temporary + C positions. The transpose gives the matrix with the + C following (x,y) coordinates. + C (0,0) <- (0,0), (0,2) <- (2,0), (0,4) <- (4,0) + C (0,3) <- (3,0), (0,1) <- (1,0), (1,3) <- (3,1) + C (1,0) <- (0,1), (1,2) <- (2,1), (1,4) <- (4,1) + C (1,1) <- (1,1), (2,1) <- (1,2), (2,3) <- (3,2) + C (2,0) <- (0,2), (2,2) <- (2,2), (2,4) <- (4,2) + C (3,4) <- (4,3), (3,1) <- (1,3), (3,3) <- (3,3) + C (3,0) <- (0,3), (3,2) <- (2,3), (4,2) <- (2,4) + C (4,4) <- (4,4), (4,1) <- (1,4), (4,3) <- (3,4) + C (4,0) <- (0,4) + C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304, C and also copy to C12 and C34 while at it. - vlvgg C12,A05,0 vlvgg C34,A15,0 vlvgg W0,A10,0 @@ -319,6 +353,7 @@ PROLOGUE(nettle_sha3_permute) vx C12,C12,A2122 clijne COUNT,0,.Loop + C Save state data stg A00,0*8(STATE) vst A0102,1*8(STATE) vst A0304,3*8(STATE) @@ -339,10 +374,11 @@ PROLOGUE(nettle_sha3_permute) vst A2122,21*8(STATE) vst A2324,23*8(STATE) + C Load non-volatile floating point registers ld %f8,0(%r1) ld %f9,8(%r1) - FREE_STACK(16) - lmg %r6,%r14,48(SP) + FREE_STACK(16) C Deallocate stack space + lmg %r6,%r14,48(SP) C Load non-volatile general registers br RA EPILOGUE(nettle_sha3_permute) |