summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNiels Möller <nisse@lysator.liu.se>2022-11-09 11:26:54 +0100
committerNiels Möller <nisse@lysator.liu.se>2022-11-09 20:54:32 +0100
commit974bc5e1c068d16cf7ddaf7f4a4cf48b1efe4f98 (patch)
tree24781997091fd9308ba77d118d6eab9652361d3a
parent0cecedba16a7f5fd6d87ea49871afd64983cdf44 (diff)
parentf0eeeb3c1e78f6db8fda078f35409873dc17db56 (diff)
downloadnettle-974bc5e1c068d16cf7ddaf7f4a4cf48b1efe4f98.tar.gz
Merge branch 'nettle-ppc-poly1305-multi' into master
See merge request nettle/nettle!56
-rw-r--r--configure.ac1
-rw-r--r--fat-ppc.c13
-rw-r--r--fat-setup.h2
-rw-r--r--poly1305-update.c15
-rw-r--r--powerpc64/fat/poly1305-blocks.asm38
-rw-r--r--powerpc64/machine.m412
-rw-r--r--powerpc64/p9/poly1305-blocks.asm434
-rw-r--r--powerpc64/p9/poly1305-internal.asm94
-rw-r--r--powerpc64/p9/poly1305.m4102
9 files changed, 630 insertions, 81 deletions
diff --git a/configure.ac b/configure.ac
index d1f30ae8..9533190a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -765,6 +765,7 @@ AH_VERBATIM([HAVE_NATIVE],
#undef HAVE_NATIVE_poly1305_block
#undef HAVE_NATIVE_poly1305_digest
#undef HAVE_NATIVE_poly1305_blocks
+#undef HAVE_NATIVE_fat_poly1305_blocks
#undef HAVE_NATIVE_ghash_set_key
#undef HAVE_NATIVE_ghash_update
#undef HAVE_NATIVE_salsa20_core
diff --git a/fat-ppc.c b/fat-ppc.c
index 7569e44d..b95365f6 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -195,6 +195,11 @@ DECLARE_FAT_FUNC(_nettle_poly1305_digest, poly1305_digest_func)
DECLARE_FAT_FUNC_VAR(poly1305_digest, poly1305_digest_func, c)
DECLARE_FAT_FUNC_VAR(poly1305_digest, poly1305_digest_func, ppc64)
+DECLARE_FAT_FUNC(_nettle_poly1305_blocks, poly1305_blocks_func)
+DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c)
+DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64)
+
+
static void CONSTRUCTOR
fat_init (void)
{
@@ -251,12 +256,14 @@ fat_init (void)
_nettle_poly1305_set_key_vec = _nettle_poly1305_set_key_ppc64;
_nettle_poly1305_block_vec = _nettle_poly1305_block_ppc64;
_nettle_poly1305_digest_vec = _nettle_poly1305_digest_ppc64;
+ _nettle_poly1305_blocks_vec = _nettle_poly1305_blocks_ppc64;
}
else
{
_nettle_poly1305_set_key_vec = _nettle_poly1305_set_key_c;
_nettle_poly1305_block_vec = _nettle_poly1305_block_c;
_nettle_poly1305_digest_vec = _nettle_poly1305_digest_c;
+ _nettle_poly1305_blocks_vec = _nettle_poly1305_blocks_c;
}
}
@@ -315,3 +322,9 @@ DEFINE_FAT_FUNC(_nettle_poly1305_digest, void,
(struct poly1305_ctx *ctx,
union nettle_block16 *s),
(ctx, s))
+
+DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *,
+ (struct poly1305_ctx *ctx,
+ size_t blocks,
+ const uint8_t *m),
+ (ctx, blocks, m))
diff --git a/fat-setup.h b/fat-setup.h
index f9c35451..6bf3e2fa 100644
--- a/fat-setup.h
+++ b/fat-setup.h
@@ -203,6 +203,8 @@ typedef void poly1305_set_key_func(struct poly1305_ctx *ctx, const uint8_t *key)
typedef void poly1305_digest_func(struct poly1305_ctx *ctx, union nettle_block16 *s);
typedef void poly1305_block_func(struct poly1305_ctx *ctx, const uint8_t *m,
unsigned high);
+typedef const uint8_t * poly1305_blocks_func(struct poly1305_ctx *ctx, size_t blocks,
+ const uint8_t *m);
struct aes128_ctx;
typedef void aes128_set_key_func (struct aes128_ctx *ctx, const uint8_t *key);
diff --git a/poly1305-update.c b/poly1305-update.c
index fdc72558..15ee3231 100644
--- a/poly1305-update.c
+++ b/poly1305-update.c
@@ -37,6 +37,21 @@
#include "poly1305-internal.h"
#include "md-internal.h"
+#if HAVE_NATIVE_fat_poly1305_blocks
+const uint8_t *
+_nettle_poly1305_blocks_c(struct poly1305_ctx *ctx,
+ size_t blocks, const uint8_t *m);
+
+const uint8_t *
+_nettle_poly1305_blocks_c(struct poly1305_ctx *ctx,
+ size_t blocks, const uint8_t *m)
+{
+ for (; blocks; blocks--, m += POLY1305_BLOCK_SIZE)
+ _nettle_poly1305_block(ctx, m, 1);
+ return m;
+}
+#endif
+
unsigned
_nettle_poly1305_update (struct poly1305_ctx *ctx,
uint8_t *block, unsigned index,
diff --git a/powerpc64/fat/poly1305-blocks.asm b/powerpc64/fat/poly1305-blocks.asm
new file mode 100644
index 00000000..9efef0a0
--- /dev/null
+++ b/powerpc64/fat/poly1305-blocks.asm
@@ -0,0 +1,38 @@
+C powerpc64/fat/poly1305-blocks.asm
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl picked up by configure
+dnl PROLOGUE(_nettle_poly1305_blocks)
+dnl PROLOGUE(_nettle_fat_poly1305_blocks)
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p9/poly1305-blocks.asm')
diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4
index b59f0863..8f28f295 100644
--- a/powerpc64/machine.m4
+++ b/powerpc64/machine.m4
@@ -51,3 +51,15 @@ forloop(i,0,63,`deflit(`vs'i,i)')
forloop(i,0,31,`deflit(`f'i,i)')
forloop(i,0,7, `deflit(`cr'i,i)')
')
+
+C Increase index of general-purpose register by specific value
+C INC_GPR(GPR, INC)
+define(`INC_GPR',`ifelse(substr($1,0,1),`r',
+``r'eval($2+substr($1,1,len($1)))',
+`eval($2+$1)')')
+
+C Increase index of vector register by specific value
+C INC_VR(VR, INC)
+define(`INC_VR',`ifelse(substr($1,0,1),`v',
+``v'eval($2+substr($1,1,len($1)))',
+`eval($2+$1)')')
diff --git a/powerpc64/p9/poly1305-blocks.asm b/powerpc64/p9/poly1305-blocks.asm
new file mode 100644
index 00000000..90e3df7b
--- /dev/null
+++ b/powerpc64/p9/poly1305-blocks.asm
@@ -0,0 +1,434 @@
+C powerpc64/p9/poly1305-blocks.asm
+
+ifelse(`
+ Copyright (C) 2013, 2022 Niels Möller
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+include_src(`powerpc64/p9/poly1305.m4')
+
+C Register usage:
+
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+C Arguments
+define(`CTX', `r3')
+define(`BLOCKS', `r4')
+define(`DATA', `r5')
+
+define(`PADBYTE', `r6') C Padding byte register
+
+define(`DEFINES_BLOCK_R44', `
+ define(`R0', `v0')
+ define(`R1', `v1')
+ define(`R2', `v2')
+ define(`S1', `v3')
+ define(`S2', `v4')
+ define(`H0', `v5')
+ define(`H1', `v6')
+ define(`H2', `v7')
+
+ define(`R3', `v8')
+ define(`R4', `v9')
+ define(`R5', `v10')
+ define(`S4', `v11')
+ define(`S5', `v12')
+
+ define(`T0', `v13')
+ define(`T1', `v14')
+ define(`T2', `v15')
+ define(`T3', `v16')
+ define(`T4', `v17')
+ define(`T5', `v18')
+ define(`TMP', `v19')
+ define(`TMP2', `v20')
+
+ define(`ZERO', `v21')
+ define(`MASK44', `v22')
+ define(`MASK42L', `v23')
+ define(`MASK44L', `v24')
+ define(`T4PAD', `v25')
+ define(`D40', `v26')
+ define(`D20', `v27')
+ define(`D24', `v28')
+ define(`D44', `v29')
+ define(`D2', `v30')
+ define(`D4', `v31')
+ ')
+
+C Compute S_1 = 20 * R_1 and S_2 = 20 * R_2
+C COMPUTE_S(S1, S2, R1, R2)
+define(`COMPUTE_S', `
+ vsld $1, $3, D2
+ vsld $2, $4, D2
+ vaddudm $1, $1, $3
+ vaddudm $2, $2, $4
+ vsld $1, $1, D2
+ vsld $2, $2, D2
+ ')
+
+C Convert two-part radix 2^64 to three-part radix 2^44 of four blocks
+C R64_TO_R44_4B(VR0, VR1, VR2, VR3, VR4, VR5)
+define(`R64_TO_R44_4B', `
+ vsrd $3, $2, D24
+ vsrd $6, $5, D24
+ vsrd TMP, $1, D44
+ vsrd TMP2, $4, D44
+ vsld $2, $2, D20
+ vsld $5, $5, D20
+ vor $2, $2, TMP
+ vor $5, $5, TMP2
+ vand $1, $1, MASK44
+ vand $4, $4, MASK44
+ vand $2, $2, MASK44
+ vand $5, $5, MASK44
+ ')
+
+C T_0 = R_0 H_0 + S_2 H_1 + S_1 H_2
+C T_1 = R_1 H_0 + R_0 H_1 + S_2 H_2
+C T_2 = R_2 H_0 + R_1 H_1 + R_0 H_2
+C MUL(T0, T1, T2, H0, H1, H2)
+define(`MUL', `
+ vmsumudm $1, $4, R0, ZERO
+ vmsumudm $2, $4, R1, ZERO
+ vmsumudm $3, $4, R2, ZERO
+
+ vmsumudm $1, $5, S2, $1
+ vmsumudm $2, $5, R0, $2
+ vmsumudm $3, $5, R1, $3
+
+ vmsumudm $1, $6, S1, $1
+ vmsumudm $2, $6, S2, $2
+ vmsumudm $3, $6, R0, $3
+ ')
+
+C Apply the equations above to four blocks
+C Each two successive blocks are interleaved horizontally
+C MUL_4B(T0, T1, T2, H0, H1, H2, H3, H4, H5)
+define(`MUL_4B', `
+ vmsumudm $1, $7, R0, ZERO
+ vmsumudm $2, $7, R1, ZERO
+ vmsumudm $3, $7, R2, ZERO
+
+ vmsumudm $1, $8, S2, $1
+ vmsumudm $2, $8, R0, $2
+ vmsumudm $3, $8, R1, $3
+
+ vmsumudm $1, $9, S1, $1
+ vmsumudm $2, $9, S2, $2
+ vmsumudm $3, $9, R0, $3
+
+ vmsumudm $1, $4, R3, $1
+ vmsumudm $2, $4, R4, $2
+ vmsumudm $3, $4, R5, $3
+
+ vmsumudm $1, $5, S5, $1
+ vmsumudm $2, $5, R3, $2
+ vmsumudm $3, $5, R4, $3
+
+ vmsumudm $1, $6, S4, $1
+ vmsumudm $2, $6, S5, $2
+ vmsumudm $3, $6, R3, $3
+ ')
+
+C Reduction phase of two interleaved chains
+C RED(H0, H1, H2, T0, T1, T2)
+define(`RED', `
+ vand $1, $4, MASK44L
+ vsro $4, $4, D40
+ vsrd $4, $4, D4
+ vadduqm $5, $5, $4
+ vand $2, $5, MASK44L
+ vsro $5, $5, D40
+ vsrd $5, $5, D4
+ vadduqm $6, $6, $5
+ vand $3, $6, MASK42L
+ vsro $6, $6, D40
+ vsrd $6, $6, D2
+ vadduqm $1, $1, $6
+ vsld $6, $6, D2
+ vadduqm $1, $1, $6
+ vsrd TMP, $1, D44
+ vand $1, $1, MASK44L
+ vadduqm $2, $2, TMP
+ ')
+
+.text
+
+C const uint8_t *_nettle_poly1305_blocks(struct poly1305_ctx *ctx,
+C                                        size_t blocks, const uint8_t *data)
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_poly1305_blocks)
+ C Save non-volatile vector registers
+ std r31,-8(SP)
+ stxv VSR(v31),-32(SP)
+ stxv VSR(v30),-48(SP)
+ stxv VSR(v29),-64(SP)
+ stxv VSR(v28),-80(SP)
+ stxv VSR(v27),-96(SP)
+ stxv VSR(v26),-112(SP)
+ stxv VSR(v25),-128(SP)
+ stxv VSR(v24),-144(SP)
+ stxv VSR(v23),-160(SP)
+ stxv VSR(v22),-176(SP)
+ stxv VSR(v21),-192(SP)
+ stxv VSR(v20),-208(SP)
+
+ C Initialize padding byte register
+ li PADBYTE, 1
+
C Process data blocks in multiples of 4
+ DEFINES_BLOCK_R44()
+ cmpldi BLOCKS, POLY1305_BLOCK_THRESHOLD
+ blt Ldata_r64
+ srdi r9, BLOCKS, 2
+ andi. BLOCKS, BLOCKS, 3
+ mtctr r9
+
+ C Initialize constants
+
+ vxor ZERO, ZERO, ZERO
+ vspltisb D2, 2
+ vspltisb D4, 4
+ addis r9, TOCP, .mask44@got@ha
+ ld r9, .mask44@got@l(r9)
+ lxvd2x VSR(MASK44), 0, r9
+ addi r9, r9, 16
+ lxvd2x VSR(MASK42L), 0, r9
+ addi r9, r9, 16
+ lxvd2x VSR(D40), 0, r9
+ addi r9, r9, 16
+ lxvd2x VSR(D20), 0, r9
+ addi r9, r9, 16
+ lxvd2x VSR(D24), 0, r9
+ addi r9, r9, 16
+ lxvd2x VSR(D44), 0, r9
+ xxmrghd VSR(MASK44L), VSR(ZERO), VSR(MASK44)
+
+ sldi r10, PADBYTE, 40
+ mtvsrdd VSR(T4PAD), r10, r10
+
+ C Load key of radix 2^44
+ lxsd R0, 0(CTX)
+ lxsd R1, 8(CTX)
+ vsrd R2, R1, D24
+ vsrd TMP, R0, D44
+ vsld R1, R1, D20
+ vor R1, R1, TMP
+ vand R0, R0, MASK44
+ vand R1, R1, MASK44
+ xxmrghd VSR(R0), VSR(R0), VSR(ZERO)
+ xxmrghd VSR(R1), VSR(R1), VSR(ZERO)
+ xxmrghd VSR(R2), VSR(R2), VSR(ZERO)
+
+ COMPUTE_S(S1, S2, R1, R2)
+
+ C Calculate R^2 = R R
+
+ MUL(T0, T1, T2, R0, R1, R2)
+ RED(H0, H1, H2, T0, T1, T2)
+ xxpermdi VSR(R0), VSR(R0), VSR(H0), 0b01
+ xxpermdi VSR(R1), VSR(R1), VSR(H1), 0b01
+ xxpermdi VSR(R2), VSR(R2), VSR(H2), 0b01
+
+ COMPUTE_S(S1, S2, R1, R2)
+
+ C Calculate R^3 = R^2 R
+
+ xxmrghd VSR(R3), VSR(ZERO), VSR(R0)
+ xxmrghd VSR(R4), VSR(ZERO), VSR(R1)
+ xxmrghd VSR(R5), VSR(ZERO), VSR(R2)
+
+ MUL(T0, T1, T2, R3, R4, R5)
+ RED(H0, H1, H2, T0, T1, T2)
+
+ C Calculate R^4 = R^2 R^2
+
+ xxmrgld VSR(R3), VSR(ZERO), VSR(R0)
+ xxmrgld VSR(R4), VSR(ZERO), VSR(R1)
+ xxmrgld VSR(R5), VSR(ZERO), VSR(R2)
+
+ MUL(T0, T1, T2, R3, R4, R5)
+ RED(R3, R4, R5, T0, T1, T2)
+ xxmrgld VSR(R3), VSR(H0), VSR(R3)
+ xxmrgld VSR(R4), VSR(H1), VSR(R4)
+ xxmrgld VSR(R5), VSR(H2), VSR(R5)
+
+ COMPUTE_S(S4, S5, R4, R5)
+
+ C Load state
+ ld r7, 32(CTX)
+ ld r8, 40(CTX)
+ ld r31, 48(CTX)
+
+ C Fold high part of H2
+ srdi r9, r31, 2
+ sldi r10, r9, 2
+ add r10, r10, r9
+ andi. r31, r31, 3
+ li r9, 0
+ addc r7, r7, r10
+ adde r8, r8, r9
+ adde r31, r31, r9
+
+ mtvsrdd VSR(H0), 0, r7
+ mtvsrdd VSR(H1), 0, r8
+ mtvsrdd VSR(H2), 0, r31
+
+ C Convert state of radix 2^64 to 2^44
+ vsrd TMP, H1, D24
+ vsld H2, H2, D40
+ vor H2, H2, TMP
+ vsrd TMP2, H0, D44
+ vsld H1, H1, D20
+ vor H1, H1, TMP2
+ vand H0, H0, MASK44
+ vand H1, H1, MASK44
+
+ li r8, 0x10
+ li r9, 0x20
+ li r10, 0x30
+L4B_loop:
+ C Load four blocks
+ lxvd2x VSR(T3), 0, DATA
+ lxvd2x VSR(T4), r8, DATA
+ lxvd2x VSR(T5), r9, DATA
+ lxvd2x VSR(TMP), r10, DATA
+IF_BE(`
+ xxbrd VSR(T3), VSR(T3)
+ xxbrd VSR(T4), VSR(T4)
+ xxbrd VSR(T5), VSR(T5)
+ xxbrd VSR(TMP), VSR(TMP)
+')
+ C Permute blocks in little-endian and line each two successive
+ C blocks horizontally
+ xxmrghd VSR(T0), VSR(T4), VSR(T3)
+ xxmrgld VSR(T1), VSR(T4), VSR(T3)
+ xxmrghd VSR(T3), VSR(TMP), VSR(T5)
+ xxmrgld VSR(T4), VSR(TMP), VSR(T5)
+ R64_TO_R44_4B(T0, T1, T2, T3, T4, T5)
+ vor T2, T2, T4PAD
+ vor T5, T5, T4PAD
+
+ C Combine first block with previous state
+ vaddudm H0, H0, T0
+ vaddudm H1, H1, T1
+ vaddudm H2, H2, T2
+
+ MUL_4B(T0, T1, T2, H0, H1, H2, T3, T4, T5)
+ RED(H0, H1, H2, T0, T1, T2)
+
+ addi DATA, DATA, 64
+ bdnz L4B_loop
+
+ C Moving carry
+ vsrd TMP, H1, D44
+ vaddudm H2, H2, TMP
+ vsrd TMP2, H2, D40
+ vsrd TMP2, TMP2, D2
+ vsld TMP, TMP2, D2
+ vand H1, H1, MASK44
+ vaddudm TMP2, TMP2, TMP
+ vaddudm H0, H0, TMP2
+ vsrd TMP, H0, D44
+ vaddudm H1, H1, TMP
+ vand H2, H2, MASK42L
+ vand H0, H0, MASK44
+
+ C Convert state of radix 2^44 to 2^64
+ vsld TMP, H1, D44
+ vor H0, H0, TMP
+ vsrd H1, H1, D20
+ vsld TMP2, H2, D24
+ vor H1, H1, TMP2
+ vsrd H2, H2, D40
+
+ xxswapd VSR(H0), VSR(H0)
+ xxswapd VSR(H1), VSR(H1)
+ xxswapd VSR(H2), VSR(H2)
+
+ C Store state
+ stxsd H0, 32(CTX)
+ stxsd H1, 40(CTX)
+ stxsd H2, 48(CTX)
+
+Ldata_r64:
+ cmpldi BLOCKS, 0
+ beq Ldone
+ mtctr BLOCKS
+ mr r4, PADBYTE
+ ld r6, P1305_H0 (CTX)
+ ld r7, P1305_H1 (CTX)
+ ld r8, P1305_H2 (CTX)
+L1B_loop:
+ BLOCK_R64(CTX,DATA,r4,r6,v0)
+ mfvsrld r6, VSR(v0)
+ mfvsrld r7, VSR(v1)
+ mfvsrd r8, VSR(v1)
+ addi DATA, DATA, 16
+ bdnz L1B_loop
+ std r6, P1305_H0 (CTX)
+ std r7, P1305_H1 (CTX)
+ std r8, P1305_H2 (CTX)
+
+Ldone:
+ C Restore non-volatile vector registers
+ ld r31, -8(SP)
+ lxv VSR(v31),-32(SP)
+ lxv VSR(v30),-48(SP)
+ lxv VSR(v29),-64(SP)
+ lxv VSR(v28),-80(SP)
+ lxv VSR(v27),-96(SP)
+ lxv VSR(v26),-112(SP)
+ lxv VSR(v25),-128(SP)
+ lxv VSR(v24),-144(SP)
+ lxv VSR(v23),-160(SP)
+ lxv VSR(v22),-176(SP)
+ lxv VSR(v21),-192(SP)
+ lxv VSR(v20),-208(SP)
+
+ mr r3, DATA
+
+ blr
+EPILOGUE(_nettle_poly1305_blocks)
+
+.rodata
+.align 4
+.mask44:
+.quad 0x00000FFFFFFFFFFF,0x00000FFFFFFFFFFF
+.mask42l:
+.quad 0x0000000000000000,0x000003FFFFFFFFFF
+.d40:
+.quad 0x0000000000000028,0x0000000000000028
+.d20:
+.quad 0x0000000000000014,0x0000000000000014
+.d24:
+.quad 0x0000000000000018,0x0000000000000018
+.d44:
+.quad 0x000000000000002C,0x000000000000002C
diff --git a/powerpc64/p9/poly1305-internal.asm b/powerpc64/p9/poly1305-internal.asm
index 18804ca8..c23e16fd 100644
--- a/powerpc64/p9/poly1305-internal.asm
+++ b/powerpc64/p9/poly1305-internal.asm
@@ -30,6 +30,8 @@ ifelse(`
not, see http://www.gnu.org/licenses/.
')
+include_src(`powerpc64/p9/poly1305.m4')
+
C Register usage:
define(`SP', `r1')
@@ -37,36 +39,8 @@ define(`TOCP', `r2')
C Arguments
define(`CTX', `r3')
-define(`M', `r4')
-define(`M128', `r5')
-
-C Working state
-define(`H0', `r6')
-define(`H1', `r7')
-define(`H2', `r8')
-define(`T0', `r9')
-define(`T1', `r10')
-define(`T2', `r8')
-define(`T2A', `r9')
-define(`T2S', `r10')
-define(`IDX', `r6')
-define(`RZ', `r7')
-
-define(`ZERO', `v0')
-define(`F0', `v1')
-define(`F1', `v2')
-define(`F0S', `v3')
-define(`T', `v4')
-
-define(`R', `v5')
-define(`S', `v6')
-
-define(`T00', `v7')
-define(`T10', `v8')
-define(`T11', `v9')
-define(`MU0', `v10')
-define(`MU1', `v11')
-define(`TMP', `v12')
+define(`DATA', `r4')
+define(`PADBYTE', `r5') C Padding byte register
.text
@@ -114,59 +88,17 @@ EPILOGUE(_nettle_poly1305_set_key)
C void _nettle_poly1305_block(struct poly1305_ctx *ctx, const uint8_t *m, unsigned m128)
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_poly1305_block)
- ld H0, P1305_H0 (CTX)
- ld H1, P1305_H1 (CTX)
- ld H2, P1305_H2 (CTX)
-IF_LE(`
- ld T0, 0(M)
- ld T1, 8(M)
-')
-IF_BE(`
- ldbrx T0, 0, M
- addi M, M, 8
- ldbrx T1, 0, M
-')
-
- addc T0, T0, H0
- adde T1, T1, H1
- adde T2, M128, H2
-
- mtvsrdd VSR(T), T0, T1
-
- li IDX, P1305_S0
- lxvd2x VSR(R), 0, CTX
- lxvd2x VSR(S), IDX, CTX
-
- andi. T2A, T2, 3
- srdi T2S, T2, 2
-
- li RZ, 0
- vxor ZERO, ZERO, ZERO
-
- xxpermdi VSR(MU0), VSR(R), VSR(S), 0b01
- xxswapd VSR(MU1), VSR(R)
-
- mtvsrdd VSR(T11), 0, T2A
- mtvsrdd VSR(T00), T2S, RZ
- mtvsrdd VSR(T10), 0, T2
-
- vmsumudm F0, T, MU0, ZERO
- vmsumudm F1, T, MU1, ZERO
- vmsumudm TMP, T11, MU1, ZERO
-
- vmsumudm F0, T00, S, F0
- vmsumudm F1, T10, MU0, F1
+ ld r6, P1305_H0 (CTX)
+ ld r7, P1305_H1 (CTX)
+ ld r8, P1305_H2 (CTX)
- xxmrgld VSR(TMP), VSR(TMP), VSR(ZERO)
- xxswapd VSR(F0S), VSR(F0)
- vadduqm F1, F1, TMP
- stxsd F0S, P1305_H0 (CTX)
+ BLOCK_R64(CTX,DATA,PADBYTE,r6,v0)
- li IDX, P1305_H1
- xxmrghd VSR(F0), VSR(ZERO), VSR(F0)
- vadduqm F1, F1, F0
- xxswapd VSR(F1), VSR(F1)
- stxvd2x VSR(F1), IDX, CTX
+ li r10, P1305_H1
+ xxswapd VSR(v0), VSR(v0)
+ xxswapd VSR(v1), VSR(v1)
+ stxsd v0, P1305_H0 (CTX)
+ stxvd2x VSR(v1), r10, CTX
blr
EPILOGUE(_nettle_poly1305_block)
diff --git a/powerpc64/p9/poly1305.m4 b/powerpc64/p9/poly1305.m4
new file mode 100644
index 00000000..13a57e83
--- /dev/null
+++ b/powerpc64/p9/poly1305.m4
@@ -0,0 +1,102 @@
+C Threshold of processing multiple blocks in parallel
+C of a multiple of 4
+define(`POLY1305_BLOCK_THRESHOLD', `12')
+
+C DEFINES_BLOCK_R64(GPR0, VR0)
+define(`DEFINES_BLOCK_R64', `
+ define(`H0', `$1')
+ define(`H1', `INC_GPR($1,1)')
+ define(`H2', `INC_GPR($1,2)')
+
+ define(`T0', `INC_GPR($1,3)')
+ define(`T1', `INC_GPR($1,4)')
+ define(`T2', `H2')
+ define(`T2A', `INC_GPR($1,3)')
+ define(`T2S', `INC_GPR($1,4)')
+ define(`RZ', `H0')
+ define(`IDX', `INC_GPR($1,4)')
+
+ define(`F0', `$2')
+ define(`F1', `INC_VR($2,1)')
+
+ define(`ZERO', `INC_VR($2,2)')
+ define(`F0S', `INC_VR($2,3)')
+ define(`F11', `INC_VR($2,4)')
+ define(`T', `INC_VR($2,5)')
+
+ define(`R', `INC_VR($2,6)')
+ define(`S', `INC_VR($2,7)')
+
+ define(`T00', `INC_VR($2,8)')
+ define(`T10', `INC_VR($2,9)')
+ define(`T11', `INC_VR($2,10)')
+ define(`MU0', `INC_VR($2,11)')
+ define(`MU1', `INC_VR($2,12)')
+ ')
+
+C CTX is the address of context where key and pre-computed values are stored
+C DATA is the address of input block
+C PADBYTE is padding byte for input block
+C GPR0 is the starting register of sequential general-purpose registers
+C used in the macro of following layout
+C GPR0, GPR1, GPR2 are inputs representing the previous state radix 2^64
+C GPR3, GPR4 are temporary registers
+C VR0 is the starting register of sequential vector registers used in
+C the macro of following layout
+C VR0, VR1 are outputs representing the result state radix 2^64 sorted as follows
+C (low 64-bit of VR0) + (low 64-bit of VR1) + (high 64-bit of VR1)
+C VR2..VR12 are temporary registers
+C BLOCK_R64(CTX, DATA, PADBYTE, GPR0, VR0)
+define(`BLOCK_R64', `
+ DEFINES_BLOCK_R64($4,$5)
+ C Load 128-bit input block
+IF_LE(`
+ ld T0, 0($2)
+ ld T1, 8($2)
+')
+IF_BE(`
+ li IDX, 8
+ ldbrx T1, IDX, $2
+ ldbrx T0, 0, $2
+')
+ C Combine state with input block, latter is padded to 17-bytes
+ C by low-order byte of PADBYTE register
+ addc T0, T0, H0
+ adde T1, T1, H1
+ adde T2, $3, H2
+
+ mtvsrdd VSR(T), T0, T1
+
+ C Load key and pre-computed values
+ li IDX, 16
+ lxvd2x VSR(R), 0, $1
+ lxvd2x VSR(S), IDX, $1
+
+ andi. T2A, T2, 3
+ srdi T2S, T2, 2
+
+ li RZ, 0
+ vxor ZERO, ZERO, ZERO
+
+ xxpermdi VSR(MU0), VSR(R), VSR(S), 0b01
+ xxswapd VSR(MU1), VSR(R)
+
+ mtvsrdd VSR(T11), 0, T2A
+ mtvsrdd VSR(T00), T2S, RZ
+ mtvsrdd VSR(T10), 0, T2
+
+ C Multiply key by combined state and block
+ vmsumudm F0, T, MU0, ZERO
+ vmsumudm F1, T, MU1, ZERO
+ vmsumudm F11, T11, MU1, ZERO
+
+ vmsumudm F0, T00, S, F0
+ vmsumudm F1, T10, MU0, F1
+
+ C Product addition
+ xxmrgld VSR(F11), VSR(F11), VSR(ZERO)
+ vadduqm F1, F1, F11
+
+ xxmrghd VSR(F0S), VSR(ZERO), VSR(F0)
+ vadduqm F1, F1, F0S
+ ')