summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Maamoun TK <maamoun.tk@googlemail.com> 2022-11-06 08:00:38 +0200
committer: Maamoun TK <maamoun.tk@googlemail.com> 2022-11-06 08:00:38 +0200
commit: 2d9f46878d4c6de044e6e0e4820d681e62851283 (patch)
tree: 4de683092fb2660ef2d3682229278d957a229558
parent: aec77fd3b29663ca5b2b7094b3b590e4262da6b4 (diff)
download: nettle-2d9f46878d4c6de044e6e0e4820d681e62851283.tar.gz
[PowerPC] Move register allocation from poly1305.m4
-rw-r--r-- powerpc64/p9/poly1305-blocks.asm 50
-rw-r--r-- powerpc64/p9/poly1305-internal.asm 27
-rw-r--r-- powerpc64/p9/poly1305.m4 105
3 files changed, 92 insertions, 90 deletions
diff --git a/powerpc64/p9/poly1305-blocks.asm b/powerpc64/p9/poly1305-blocks.asm
index 3f729e98..cbd03505 100644
--- a/powerpc64/p9/poly1305-blocks.asm
+++ b/powerpc64/p9/poly1305-blocks.asm
@@ -37,15 +37,12 @@ C Register usage:
define(`SP', `r1')
define(`TOCP', `r2')
-define(`DEFINES_BLOCK_ARG_R64', `
- C State inputs
- define(`H0', `r6')
- define(`H1', `r7')
- define(`H2', `r8')
- C State outputs
- define(`F0', `v1')
- define(`F1', `v2')
- ')
+C Arguments
+define(`CTX', `r3')
+define(`BLOCKS', `r4')
+define(`DATA', `r5')
+
+define(`PADBYTE', `r6') C Padding byte register
define(`DEFINES_BLOCK_R44', `
define(`R0', `v0')
@@ -203,17 +200,15 @@ PROLOGUE(_nettle_poly1305_blocks)
stxv VSR(v21),-192(SP)
stxv VSR(v20),-208(SP)
- mr LEN, r4
- mr DATA, r5
C Initialize padding byte register
li PADBYTE, 1
C Process data blocks of number of multiple 4
DEFINES_BLOCK_R44()
- cmpldi LEN, POLY1305_BLOCK_THRESHOLD
+ cmpldi BLOCKS, POLY1305_BLOCK_THRESHOLD
blt Ldata_r64
- srdi r9, LEN, 2
- andi. LEN, LEN, 3
+ srdi r9, BLOCKS, 2
+ andi. BLOCKS, BLOCKS, 3
mtctr r9
C Initialize constants
@@ -384,24 +379,23 @@ IF_BE(`
stxsd H2, 48(CTX)
Ldata_r64:
- DEFINES_BLOCK_ARG_R64()
- C COUNTER = LEN / 16
- cmpldi LEN, 0
+ cmpldi BLOCKS, 0
beq Ldone
- mtctr LEN
- ld H0, P1305_H0 (CTX)
- ld H1, P1305_H1 (CTX)
- ld H2, P1305_H2 (CTX)
+ mtctr BLOCKS
+ mr r4, PADBYTE
+ ld r6, P1305_H0 (CTX)
+ ld r7, P1305_H1 (CTX)
+ ld r8, P1305_H2 (CTX)
L1B_loop:
- BLOCK_R64(F0,F1,H0,H1,H2)
- mfvsrld H0, VSR(F0)
- mfvsrld H1, VSR(F1)
- mfvsrd H2, VSR(F1)
+ BLOCK_R64(CTX,DATA,r4,r6,v0)
+ mfvsrld r6, VSR(v0)
+ mfvsrld r7, VSR(v1)
+ mfvsrd r8, VSR(v1)
addi DATA, DATA, 16
bdnz L1B_loop
- std H0, P1305_H0 (CTX)
- std H1, P1305_H1 (CTX)
- std H2, P1305_H2 (CTX)
+ std r6, P1305_H0 (CTX)
+ std r7, P1305_H1 (CTX)
+ std r8, P1305_H2 (CTX)
Ldone:
C Restore non-volatile vector registers
diff --git a/powerpc64/p9/poly1305-internal.asm b/powerpc64/p9/poly1305-internal.asm
index a1e46e8f..c23e16fd 100644
--- a/powerpc64/p9/poly1305-internal.asm
+++ b/powerpc64/p9/poly1305-internal.asm
@@ -37,13 +37,10 @@ C Register usage:
define(`SP', `r1')
define(`TOCP', `r2')
-C State inputs
-define(`H0', `r6')
-define(`H1', `r7')
-define(`H2', `r8')
-C State outputs
-define(`F0', `v1')
-define(`F1', `v2')
+C Arguments
+define(`CTX', `r3')
+define(`DATA', `r4')
+define(`PADBYTE', `r5') C Padding byte register
.text
@@ -91,17 +88,17 @@ EPILOGUE(_nettle_poly1305_set_key)
C void _nettle_poly1305_block(struct poly1305_ctx *ctx, const uint8_t *m, unsigned m128)
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_poly1305_block)
- ld H0, P1305_H0 (CTX)
- ld H1, P1305_H1 (CTX)
- ld H2, P1305_H2 (CTX)
+ ld r6, P1305_H0 (CTX)
+ ld r7, P1305_H1 (CTX)
+ ld r8, P1305_H2 (CTX)
- BLOCK_R64(F0,F1,H0,H1,H2)
+ BLOCK_R64(CTX,DATA,PADBYTE,r6,v0)
li r10, P1305_H1
- xxswapd VSR(F0), VSR(F0)
- xxswapd VSR(F1), VSR(F1)
- stxsd F0, P1305_H0 (CTX)
- stxvd2x VSR(F1), r10, CTX
+ xxswapd VSR(v0), VSR(v0)
+ xxswapd VSR(v1), VSR(v1)
+ stxsd v0, P1305_H0 (CTX)
+ stxvd2x VSR(v1), r10, CTX
blr
EPILOGUE(_nettle_poly1305_block)
diff --git a/powerpc64/p9/poly1305.m4 b/powerpc64/p9/poly1305.m4
index 3cb63f82..6a07ed6f 100644
--- a/powerpc64/p9/poly1305.m4
+++ b/powerpc64/p9/poly1305.m4
@@ -2,64 +2,75 @@ C Threshold of processing multiple blocks in parallel
C of a multiple of 4
define(`POLY1305_BLOCK_THRESHOLD', `12')
-C Argments
-define(`CTX', `r3')
-define(`DATA', `r4')
-define(`PADBYTE', `r5') C Padding byte register
-define(`LEN', `r6')
-
+C DEFINES_BLOCK_R64(GPR0, VR0)
define(`DEFINES_BLOCK_R64', `
- define(`T0', `r9')
- define(`T1', `r10')
- define(`T2', `r8')
- define(`T2A', `r9')
- define(`T2S', `r10')
- define(`RZ', `r6')
- define(`IDX', `r10')
-
- define(`ZERO', `v0')
- define(`F0S', `v3')
- define(`F11', `v4')
- define(`T', `v5')
-
- define(`R', `v6')
- define(`S', `v7')
-
- define(`T00', `v8')
- define(`T10', `v9')
- define(`T11', `v10')
- define(`MU0', `v11')
- define(`MU1', `v12')
+ define(`H0', `eval(0+$1)')
+ define(`H1', `eval(1+$1)')
+ define(`H2', `eval(2+$1)')
+
+ define(`T0', `eval(3+$1)')
+ define(`T1', `eval(4+$1)')
+ define(`T2', `eval(2+$1)')
+ define(`T2A', `eval(3+$1)')
+ define(`T2S', `eval(4+$1)')
+ define(`RZ', `eval(0+$1)')
+ define(`IDX', `eval(4+$1)')
+
+ define(`F0', `eval(0+$2)')
+ define(`F1', `eval(1+$2)')
+
+ define(`ZERO', `eval(2+$2)')
+ define(`F0S', `eval(3+$2)')
+ define(`F11', `eval(4+$2)')
+ define(`T', `eval(5+$2)')
+
+ define(`R', `eval(6+$2)')
+ define(`S', `eval(7+$2)')
+
+ define(`T00', `eval(8+$2)')
+ define(`T10', `eval(9+$2)')
+ define(`T11', `eval(10+$2)')
+ define(`MU0', `eval(11+$2)')
+ define(`MU1', `eval(12+$2)')
')
-C Inputs H0, H1, H2 are general-puropse registers of previous state radix 2^64
-C Outputs F0, F1 are vector registers of result state radix 2^64 sorted as follows
+C CTX is the address of context where key and pre-computed values are stored
+C DATA is the address of input block
+C PADBYTE is padding byte for input block
+C GPR0 is the first of a run of consecutive general-purpose registers
+C used by the macro, laid out as follows
+C GPR0, GPR1, GPR2 are inputs holding the previous state in radix 2^64
+C GPR3, GPR4 are temporary registers
+C VR0 is the first of a run of consecutive vector registers used by
+C the macro, laid out as follows
+C VR0, VR1 are outputs representing the result state radix 2^64 sorted as follows
C (low 64-bit of F0) + (low 64-bit of F1) + (high 64-bit of F1)
-C BLOCK_R64(F0, F1, H0, H1, H2)
+C VR2..VR12 are temporary registers
+C BLOCK_R64(CTX, DATA, PADBYTE, GPR0, VR0)
define(`BLOCK_R64', `
- DEFINES_BLOCK_R64()
+ DEFINES_BLOCK_R64($4,$5)
C Load 128-bit input block
IF_LE(`
- ld T0, 0(DATA)
- ld T1, 8(DATA)
+ ld T0, 0($2)
+ ld T1, 8($2)
')
IF_BE(`
li IDX, 8
- ldbrx T1, IDX, DATA
- ldbrx T0, 0, DATA
+ ldbrx T1, IDX, $2
+ ldbrx T0, 0, $2
')
C Combine state with input block, latter is padded to 17-bytes
C by low-order byte of PADBYTE register
- addc T0, T0, $3
- adde T1, T1, $4
- adde T2, PADBYTE, $5
+ addc T0, T0, H0
+ adde T1, T1, H1
+ adde T2, $3, H2
mtvsrdd VSR(T), T0, T1
C Load key and pre-computed values
li IDX, 16
- lxvd2x VSR(R), 0, CTX
- lxvd2x VSR(S), IDX, CTX
+ lxvd2x VSR(R), 0, $1
+ lxvd2x VSR(S), IDX, $1
andi. T2A, T2, 3
srdi T2S, T2, 2
@@ -75,17 +86,17 @@ IF_BE(`
mtvsrdd VSR(T10), 0, T2
C Mutiplicate key by combined state and block
- vmsumudm $1, T, MU0, ZERO
- vmsumudm $2, T, MU1, ZERO
+ vmsumudm F0, T, MU0, ZERO
+ vmsumudm F1, T, MU1, ZERO
vmsumudm F11, T11, MU1, ZERO
- vmsumudm $1, T00, S, $1
- vmsumudm $2, T10, MU0, $2
+ vmsumudm F0, T00, S, F0
+ vmsumudm F1, T10, MU0, F1
C Product addition
xxmrgld VSR(F11), VSR(F11), VSR(ZERO)
- vadduqm $2, $2, F11
+ vadduqm F1, F1, F11
- xxmrghd VSR(F0S), VSR(ZERO), VSR($1)
- vadduqm $2, $2, F0S
+ xxmrghd VSR(F0S), VSR(ZERO), VSR(F0)
+ vadduqm F1, F1, F0S
')