summaryrefslogtreecommitdiff
path: root/s390x
diff options
context:
space:
mode:
authorMamone Tarsha <maamoun.tk@googlemail.com>2021-10-24 20:39:11 +0200
committerMamone Tarsha <maamoun.tk@googlemail.com>2021-10-24 20:39:11 +0200
commit571d2cc2895d587001a14d951ba317e81e9e3bfd (patch)
treed8b24caa130936e717b5c78aa1bd94eadbabee7b /s390x
parent259ec19afbb3645d796aa31014416848b80912b2 (diff)
downloadnettle-571d2cc2895d587001a14d951ba317e81e9e3bfd.tar.gz
[S390x] Improvements on documentation and instruction set usage for SHA3 permute
Diffstat (limited to 's390x')
-rw-r--r--s390x/vf/sha3-permute.asm106
1 files changed, 71 insertions, 35 deletions
diff --git a/s390x/vf/sha3-permute.asm b/s390x/vf/sha3-permute.asm
index 517ce894..d656b97c 100644
--- a/s390x/vf/sha3-permute.asm
+++ b/s390x/vf/sha3-permute.asm
@@ -85,15 +85,17 @@ C void
C sha3_permute(struct sha3_ctx *ctx)
PROLOGUE(nettle_sha3_permute)
- stmg %r6,%r14,48(SP)
- ALLOC_STACK(%r1,16)
+ stmg %r6,%r14,48(SP) C Save non-volatile general registers
+ ALLOC_STACK(%r1,16) C Allocate 16-byte space on stack
+ C Save non-volatile floating point registers
std %f8,0(%r1)
std %f9,8(%r1)
lghi COUNT,24*8
- larl RC,.rc
+ larl RC,.rc C Load address of rc data
aghi RC,-8
+ C Load state data
lg A00,0*8(STATE)
vl A0102,1*8(STATE)
vl A0304,3*8(STATE)
@@ -130,25 +132,31 @@ PROLOGUE(nettle_sha3_permute)
.align 16
.Loop:
+ C The theta step. Combine parity bits, then xor to state.
+ C D0 = C4 ^ (C1 <<< 1)
+ C D1 = C0 ^ (C2 <<< 1)
+ C D2 = C1 ^ (C3 <<< 1)
+ C D3 = C2 ^ (C4 <<< 1)
+ C D4 = C3 ^ (C0 <<< 1)
+
+ C Shift the words around, putting (C0, C1) in D12, (C2, C3) in
+ C D34, and (C4, C0) in C34.
+
vlvgg D12,C0,0
vmrhg D12,D12,C12 C Holds C0, C1
vpdi D34,C12,C34,0b0100 C Holds C2, C3
vpdi C34,C34,D12,0b0100 C Holds C4, C0
- vlgvg D0,C34,0
+
vlgvg T0,C12,0
+ vlgvg D0,C34,0
rllg T0,T0,1
xgr D0,T0
- C Can use C12 as temporary
- veslg W0,D34,1
- vesrlg W1,D34,63
- vx D12,D12,W0
- vx D12,D12,W1 C Done D12
+ verllg W0,D34,1
+ vx D12,D12,W0 C Done D12
- veslg C12,C34,1
- vesrlg C34,C34,63
- vx D34,D34,C34
- vx D34,D34,C12 C Done D34
+ verllg W1,C34,1
+ vx D34,D34,W1 C Done D34
xgr A00,D0
xgr A05,D0
@@ -166,24 +174,37 @@ PROLOGUE(nettle_sha3_permute)
vx A1819,A1819,D34
vx A2324,A2324,D34
- C Do the 1,2,3,4 row. First rotate, then permute.
- vesrlg W0,A0102,63
- veslg W1,A0102,62
- vesrlg W2,A0102,2
- veslg A0102,A0102,1
- vo W0,W0,A0102 C veslg 1 (A01)
- vo W2,W2,W1 C veslg 62 (A02)
-
- veslg A0102,A0304,28
- vesrlg W1,A0304,36
- vo A0102,A0102,W1 C veslg 28 (A03)
- vesrlg W1,A0304,37
- veslg A0304,A0304,27
- vo A0304,A0304,W1 C veslg 27 (A04)
+ C theta step done, no C, D or W temporaries alive.
+
+ C rho step. When doing the permutations, also
+ C transpose the rows of matrix into temporary
+ C coordinates to assist the chi step.
+ C Defer pi step to the last phase.
+
+ C The combined permutation + transpose gives the following
+ C cycles (rotation counts in parentheses)
+ C 0 <- 0(0)
+ C 1 <- 3(28) <- 4(27) <- 2(62) <- 1(1)
+ C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
+ C 7 <- 7(6)
+ C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
+ C 14 <- 14(39)
+ C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
+ C 16 <- 16(45)
+ C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
+ C 23 <- 23(56)
+
+ C Do the 1,2,3,4 row. First rotate (permute), then transpose.
+ verllg W0,A0102,1 C verllg 1 (A01)
+ verllg W2,A0102,62 C verllg 62 (A02)
+
+ verllg A0102,A0304,28 C verllg 28 (A03)
+ verllg A0304,A0304,27 C verllg 27 (A04)
vmrhg A0102,A0102,W0
vmrlg A0304,A0304,W2
+ C Do the 5,6,7,8,9 row.
rllg A05,A05,36
vlvgg W0,A05,0
vlgvg A05,A0607,0
@@ -194,17 +215,19 @@ PROLOGUE(nettle_sha3_permute)
verllg W1,A0809,55
vmrhg A0809,W0,W1 C Done A0809
- rllg A10,A10,42 C 42 + 25 = 3 (mod 64)
+ C Do the 10,11,12,13,14 row.
+ C Rotated by a further 25 with verllg later: 42 + 25 = 3 (mod 64)
+ rllg A10,A10,42
verllg W0,A1112,10
vlvgg A1112,A10,0
vlgvg A10,A1112,1
rllg A10,A10,43 C Done A10
-
vmrhg A1112,A1112,A1314
verllg A1112,A1112,25 C Done A1112
verllg W2,A1314,39
vpdi A1314,W0,W2,0b0001 C Done A1314
+ C Do the 15,16,17,18,19 row.
verllg W0,A1819,8
rllg A15,A15,41
vlvgg W1,A15,1
@@ -215,15 +238,14 @@ PROLOGUE(nettle_sha3_permute)
vpdi A1617,A1617,W0,0b0001 C Done A1617
vmrlg A1819,A1819,W1 C Done A1819
+ C Do the 20,21,22,23,24 row.
rllg A20,A20,18
vlvgg W0,A20,1
vlgvg A20,A2324,1
rllg A20,A20,14 C Done A20
verllg A2324,A2324,56
-
verllg W2,A2122,2
vmrhg A2324,A2324,W2 C Done A2324
-
verllg A2122,A2122,61
vmrlg A2122,A2122,W0 C Done A2122
@@ -268,13 +290,25 @@ PROLOGUE(nettle_sha3_permute)
vx A0304,A0304,W0
vx A0809,A0809,W1
+ C iota step.
lg TMP,0(COUNT,RC)
xgr A00,TMP
- C Transpose.
+ C Deferred pi step. Transpose the matrix from the temporary
+ C positions. The transpose gives the matrix with the
+ C following (x,y) coordinates.
+ C (0,0) <- (0,0), (0,2) <- (2,0), (0,4) <- (4,0)
+ C (0,3) <- (3,0), (0,1) <- (1,0), (1,3) <- (3,1)
+ C (1,0) <- (0,1), (1,2) <- (2,1), (1,4) <- (4,1)
+ C (1,1) <- (1,1), (2,1) <- (1,2), (2,3) <- (3,2)
+ C (2,0) <- (0,2), (2,2) <- (2,2), (2,4) <- (4,2)
+ C (3,4) <- (4,3), (3,1) <- (1,3), (3,3) <- (3,3)
+ C (3,0) <- (0,3), (3,2) <- (2,3), (4,2) <- (2,4)
+ C (4,4) <- (4,4), (4,1) <- (1,4), (4,3) <- (3,4)
+ C (4,0) <- (0,4)
+
C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304,
C and also copy to C12 and C34 while at it.
-
vlvgg C12,A05,0
vlvgg C34,A15,0
vlvgg W0,A10,0
@@ -319,6 +353,7 @@ PROLOGUE(nettle_sha3_permute)
vx C12,C12,A2122
clijne COUNT,0,.Loop
+ C Save state data
stg A00,0*8(STATE)
vst A0102,1*8(STATE)
vst A0304,3*8(STATE)
@@ -339,10 +374,11 @@ PROLOGUE(nettle_sha3_permute)
vst A2122,21*8(STATE)
vst A2324,23*8(STATE)
+ C Restore non-volatile floating point registers
ld %f8,0(%r1)
ld %f9,8(%r1)
- FREE_STACK(16)
- lmg %r6,%r14,48(SP)
+ FREE_STACK(16) C Deallocate stack space
+ lmg %r6,%r14,48(SP) C Load non-volatile general registers
br RA
EPILOGUE(nettle_sha3_permute)