diff options
-rw-r--r-- | configure.ac | 1 | ||||
-rw-r--r-- | fat-ppc.c | 13 | ||||
-rw-r--r-- | fat-setup.h | 2 | ||||
-rw-r--r-- | poly1305-update.c | 17 | ||||
-rw-r--r-- | powerpc64/fat/poly1305-blocks.asm | 36 | ||||
-rw-r--r-- | powerpc64/p9/poly1305-blocks.asm | 440 | ||||
-rw-r--r-- | powerpc64/p9/poly1305-internal.asm | 83 | ||||
-rw-r--r-- | powerpc64/p9/poly1305.m4 | 91 |
8 files changed, 608 insertions, 75 deletions
diff --git a/configure.ac b/configure.ac index 4e9dceda..c3f0bd8e 100644 --- a/configure.ac +++ b/configure.ac @@ -763,6 +763,7 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_poly1305_block #undef HAVE_NATIVE_poly1305_digest #undef HAVE_NATIVE_poly1305_blocks +#undef HAVE_NATIVE_fat_poly1305_blocks #undef HAVE_NATIVE_ghash_set_key #undef HAVE_NATIVE_ghash_update #undef HAVE_NATIVE_salsa20_core @@ -195,6 +195,11 @@ DECLARE_FAT_FUNC(_nettle_poly1305_digest, poly1305_digest_func) DECLARE_FAT_FUNC_VAR(poly1305_digest, poly1305_digest_func, c) DECLARE_FAT_FUNC_VAR(poly1305_digest, poly1305_digest_func, ppc64) +DECLARE_FAT_FUNC(_nettle_poly1305_blocks, poly1305_blocks_func) +DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c) +DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64) + + static void CONSTRUCTOR fat_init (void) { @@ -251,12 +256,14 @@ fat_init (void) _nettle_poly1305_set_key_vec = _nettle_poly1305_set_key_ppc64; _nettle_poly1305_block_vec = _nettle_poly1305_block_ppc64; _nettle_poly1305_digest_vec = _nettle_poly1305_digest_ppc64; + _nettle_poly1305_blocks_vec = _nettle_poly1305_blocks_ppc64; } else { _nettle_poly1305_set_key_vec = _nettle_poly1305_set_key_c; _nettle_poly1305_block_vec = _nettle_poly1305_block_c; _nettle_poly1305_digest_vec = _nettle_poly1305_digest_c; + _nettle_poly1305_blocks_vec = _nettle_poly1305_blocks_c; } } @@ -315,3 +322,9 @@ DEFINE_FAT_FUNC(_nettle_poly1305_digest, void, (struct poly1305_ctx *ctx, union nettle_block16 *s), (ctx, s)) + +DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *, + (struct poly1305_ctx *ctx, + size_t blocks, + const uint8_t *m), + (ctx, blocks, m)) diff --git a/fat-setup.h b/fat-setup.h index f9c35451..6bf3e2fa 100644 --- a/fat-setup.h +++ b/fat-setup.h @@ -203,6 +203,8 @@ typedef void poly1305_set_key_func(struct poly1305_ctx *ctx, const uint8_t *key) typedef void poly1305_digest_func(struct poly1305_ctx *ctx, union nettle_block16 *s); typedef void poly1305_block_func(struct 
poly1305_ctx *ctx, const uint8_t *m, unsigned high); +typedef const uint8_t * poly1305_blocks_func(struct poly1305_ctx *ctx, size_t blocks, + const uint8_t *m); struct aes128_ctx; typedef void aes128_set_key_func (struct aes128_ctx *ctx, const uint8_t *key); diff --git a/poly1305-update.c b/poly1305-update.c index fdc72558..5493e17b 100644 --- a/poly1305-update.c +++ b/poly1305-update.c @@ -37,6 +37,21 @@ #include "poly1305-internal.h" #include "md-internal.h" +#if HAVE_NATIVE_fat_poly1305_blocks +const uint8_t * +_nettle_poly1305_blocks_c(struct poly1305_ctx *ctx, + size_t blocks, const uint8_t *m); + +const uint8_t * +_nettle_poly1305_blocks_c(struct poly1305_ctx *ctx, + size_t blocks, const uint8_t *m) +{ + for (; blocks; blocks--, m += POLY1305_BLOCK_SIZE) + _nettle_poly1305_block(ctx, m, 1); + return m; +} +#endif + unsigned _nettle_poly1305_update (struct poly1305_ctx *ctx, uint8_t *block, unsigned index, @@ -49,7 +64,7 @@ _nettle_poly1305_update (struct poly1305_ctx *ctx, length, m); _nettle_poly1305_block(ctx, block, 1); } -#if HAVE_NATIVE_poly1305_blocks +#if HAVE_NATIVE_poly1305_blocks || HAVE_NATIVE_fat_poly1305_blocks m = _nettle_poly1305_blocks (ctx, length >> 4, m); length &= 15; #else diff --git a/powerpc64/fat/poly1305-blocks.asm b/powerpc64/fat/poly1305-blocks.asm new file mode 100644 index 00000000..e6435f44 --- /dev/null +++ b/powerpc64/fat/poly1305-blocks.asm @@ -0,0 +1,36 @@ +C powerpc64/fat/poly1305-blocks.asm + +ifelse(` + Copyright (C) 2022 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. 
+ + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl PROLOGUE(_nettle_fat_poly1305_blocks) picked up by configure + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p9/poly1305-blocks.asm') diff --git a/powerpc64/p9/poly1305-blocks.asm b/powerpc64/p9/poly1305-blocks.asm new file mode 100644 index 00000000..3f729e98 --- /dev/null +++ b/powerpc64/p9/poly1305-blocks.asm @@ -0,0 +1,440 @@ +C powerpc64/p9/poly1305-blocks.asm + +ifelse(` + Copyright (C) 2013, 2022 Niels Möller + Copyright (C) 2022 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +include_src(`powerpc64/p9/poly1305.m4') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`DEFINES_BLOCK_ARG_R64', ` + C State inputs + define(`H0', `r6') + define(`H1', `r7') + define(`H2', `r8') + C State outputs + define(`F0', `v1') + define(`F1', `v2') + ') + +define(`DEFINES_BLOCK_R44', ` + define(`R0', `v0') + define(`R1', `v1') + define(`R2', `v2') + define(`S1', `v3') + define(`S2', `v4') + define(`H0', `v5') + define(`H1', `v6') + define(`H2', `v7') + + define(`R3', `v8') + define(`R4', `v9') + define(`R5', `v10') + define(`S4', `v11') + define(`S5', `v12') + + define(`T0', `v13') + define(`T1', `v14') + define(`T2', `v15') + define(`T3', `v16') + define(`T4', `v17') + define(`T5', `v18') + define(`TMP', `v19') + define(`TMP2', `v20') + + define(`ZERO', `v21') + define(`MASK44', `v22') + define(`MASK42L', `v23') + define(`MASK44L', `v24') + define(`T4PAD', `v25') + define(`D40', `v26') + define(`D20', `v27') + define(`D24', `v28') + define(`D44', `v29') + define(`D2', `v30') + define(`D4', `v31') + ') + +C Compute S_1 = 20 * R_1 and S_2 = 20 * R_2 +C COMPUTE_S(S1, S2, R1, R2) +define(`COMPUTE_S', ` + vsld $1, $3, D2 + vsld $2, $4, D2 + vaddudm $1, $1, $3 + vaddudm $2, $2, $4 + vsld $1, $1, D2 + vsld $2, $2, D2 + ') + +C Convert two-part radix 2^64 to three-part radix 2^44 of four blocks +C R64_TO_R44_4B(VR0, VR1, VR2, VR3, VR4, VR5) +define(`R64_TO_R44_4B', ` + vsrd $3, $2, D24 + vsrd $6, $5, D24 + vsrd TMP, $1, D44 + vsrd TMP2, $4, D44 + vsld $2, $2, D20 + vsld $5, $5, D20 + vor $2, $2, TMP + vor $5, $5, TMP2 + vand $1, $1, MASK44 + vand $4, $4, MASK44 + vand $2, $2, MASK44 + vand $5, $5, MASK44 + ') + +C T_0 = R_0 H_0 + S_2 H_1 + S_1 H_2 +C T_1 = R_1 H_0 + R_0 H_1 + S_2 H_2 +C T_2 = R_2 H_0 + R_1 H_1 + R_0 H_2 +C MUL(T0, T1, T2, H0, H1, H2) +define(`MUL', ` + vmsumudm $1, $4, R0, ZERO + vmsumudm $2, $4, R1, ZERO + vmsumudm $3, $4, R2, ZERO + + vmsumudm $1, $5, S2, $1 + vmsumudm $2, $5, R0, $2 + vmsumudm $3, $5, R1, $3 + + 
vmsumudm $1, $6, S1, $1 + vmsumudm $2, $6, S2, $2 + vmsumudm $3, $6, R0, $3 + ') + +C Apply aforenamed equations on four-blocks +C Each two successive blocks are interleaved horizontally +C MUL_4B(T0, T1, T2, H0, H1, H2, H3, H4, H5) +define(`MUL_4B', ` + vmsumudm $1, $7, R0, ZERO + vmsumudm $2, $7, R1, ZERO + vmsumudm $3, $7, R2, ZERO + + vmsumudm $1, $8, S2, $1 + vmsumudm $2, $8, R0, $2 + vmsumudm $3, $8, R1, $3 + + vmsumudm $1, $9, S1, $1 + vmsumudm $2, $9, S2, $2 + vmsumudm $3, $9, R0, $3 + + vmsumudm $1, $4, R3, $1 + vmsumudm $2, $4, R4, $2 + vmsumudm $3, $4, R5, $3 + + vmsumudm $1, $5, S5, $1 + vmsumudm $2, $5, R3, $2 + vmsumudm $3, $5, R4, $3 + + vmsumudm $1, $6, S4, $1 + vmsumudm $2, $6, S5, $2 + vmsumudm $3, $6, R3, $3 + ') + +C Reduction phase of two interleaved chains +C RED(H0, H1, H2, T0, T1, T2) +define(`RED', ` + vand $1, $4, MASK44L + vsro $4, $4, D40 + vsrd $4, $4, D4 + vadduqm $5, $5, $4 + vand $2, $5, MASK44L + vsro $5, $5, D40 + vsrd $5, $5, D4 + vadduqm $6, $6, $5 + vand $3, $6, MASK42L + vsro $6, $6, D40 + vsrd $6, $6, D2 + vadduqm $1, $1, $6 + vsld $6, $6, D2 + vadduqm $1, $1, $6 + vsrd TMP, $1, D44 + vand $1, $1, MASK44L + vadduqm $2, $2, TMP + ') + +.text + +C void _nettle_poly1305_blocks(struct poly1305_ctx *ctx, +C size_t length, const uint8_t *data) +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_poly1305_blocks) + C Save non-volatile vector registers + std r31,-8(SP) + stxv VSR(v31),-32(SP) + stxv VSR(v30),-48(SP) + stxv VSR(v29),-64(SP) + stxv VSR(v28),-80(SP) + stxv VSR(v27),-96(SP) + stxv VSR(v26),-112(SP) + stxv VSR(v25),-128(SP) + stxv VSR(v24),-144(SP) + stxv VSR(v23),-160(SP) + stxv VSR(v22),-176(SP) + stxv VSR(v21),-192(SP) + stxv VSR(v20),-208(SP) + + mr LEN, r4 + mr DATA, r5 + C Initialize padding byte register + li PADBYTE, 1 + +C Process data blocks of number of multiple 4 + DEFINES_BLOCK_R44() + cmpldi LEN, POLY1305_BLOCK_THRESHOLD + blt Ldata_r64 + srdi r9, LEN, 2 + andi. 
LEN, LEN, 3 + mtctr r9 + + C Initialize constants + + vxor ZERO, ZERO, ZERO + vspltisb D2, 2 + vspltisb D4, 4 + addis r9, TOCP, .mask44@got@ha + ld r9, .mask44@got@l(r9) + lxvd2x VSR(MASK44), 0, r9 + addi r9, r9, 16 + lxvd2x VSR(MASK42L), 0, r9 + addi r9, r9, 16 + lxvd2x VSR(D40), 0, r9 + addi r9, r9, 16 + lxvd2x VSR(D20), 0, r9 + addi r9, r9, 16 + lxvd2x VSR(D24), 0, r9 + addi r9, r9, 16 + lxvd2x VSR(D44), 0, r9 + xxmrghd VSR(MASK44L), VSR(ZERO), VSR(MASK44) + + sldi r10, PADBYTE, 40 + mtvsrdd VSR(T4PAD), r10, r10 + + C Load key of radix 2^44 + lxsd R0, 0(CTX) + lxsd R1, 8(CTX) + vsrd R2, R1, D24 + vsrd TMP, R0, D44 + vsld R1, R1, D20 + vor R1, R1, TMP + vand R0, R0, MASK44 + vand R1, R1, MASK44 + xxmrghd VSR(R0), VSR(R0), VSR(ZERO) + xxmrghd VSR(R1), VSR(R1), VSR(ZERO) + xxmrghd VSR(R2), VSR(R2), VSR(ZERO) + + COMPUTE_S(S1, S2, R1, R2) + + C Calculate R^2 = R R + + MUL(T0, T1, T2, R0, R1, R2) + RED(H0, H1, H2, T0, T1, T2) + xxpermdi VSR(R0), VSR(R0), VSR(H0), 0b01 + xxpermdi VSR(R1), VSR(R1), VSR(H1), 0b01 + xxpermdi VSR(R2), VSR(R2), VSR(H2), 0b01 + + COMPUTE_S(S1, S2, R1, R2) + + C Calculate R^3 = R^2 R + + xxmrghd VSR(R3), VSR(ZERO), VSR(R0) + xxmrghd VSR(R4), VSR(ZERO), VSR(R1) + xxmrghd VSR(R5), VSR(ZERO), VSR(R2) + + MUL(T0, T1, T2, R3, R4, R5) + RED(H0, H1, H2, T0, T1, T2) + + C Calculate R^4 = R^2 R^2 + + xxmrgld VSR(R3), VSR(ZERO), VSR(R0) + xxmrgld VSR(R4), VSR(ZERO), VSR(R1) + xxmrgld VSR(R5), VSR(ZERO), VSR(R2) + + MUL(T0, T1, T2, R3, R4, R5) + RED(R3, R4, R5, T0, T1, T2) + xxmrgld VSR(R3), VSR(H0), VSR(R3) + xxmrgld VSR(R4), VSR(H1), VSR(R4) + xxmrgld VSR(R5), VSR(H2), VSR(R5) + + COMPUTE_S(S4, S5, R4, R5) + + C Load state + ld r7, 32(CTX) + ld r8, 40(CTX) + ld r31, 48(CTX) + + C Fold high part of H2 + srdi r9, r31, 2 + sldi r10, r9, 2 + add r10, r10, r9 + andi. 
r31, r31, 3 + li r9, 0 + addc r7, r7, r10 + adde r8, r8, r9 + adde r31, r31, r9 + + mtvsrdd VSR(H0), 0, r7 + mtvsrdd VSR(H1), 0, r8 + mtvsrdd VSR(H2), 0, r31 + + C Convert state of radix 2^64 to 2^44 + vsrd TMP, H1, D24 + vsld H2, H2, D40 + vor H2, H2, TMP + vsrd TMP2, H0, D44 + vsld H1, H1, D20 + vor H1, H1, TMP2 + vand H0, H0, MASK44 + vand H1, H1, MASK44 + + li r8, 0x10 + li r9, 0x20 + li r10, 0x30 +L4B_loop: + C Load four blocks + lxvd2x VSR(T3), 0, DATA + lxvd2x VSR(T4), r8, DATA + lxvd2x VSR(T5), r9, DATA + lxvd2x VSR(TMP), r10, DATA +IF_BE(` + xxbrd VSR(T3), VSR(T3) + xxbrd VSR(T4), VSR(T4) + xxbrd VSR(T5), VSR(T5) + xxbrd VSR(TMP), VSR(TMP) +') + C Permute blocks in little-endian and line each two successive + C blocks horizontally + xxmrghd VSR(T0), VSR(T4), VSR(T3) + xxmrgld VSR(T1), VSR(T4), VSR(T3) + xxmrghd VSR(T3), VSR(TMP), VSR(T5) + xxmrgld VSR(T4), VSR(TMP), VSR(T5) + R64_TO_R44_4B(T0, T1, T2, T3, T4, T5) + vor T2, T2, T4PAD + vor T5, T5, T4PAD + + C Combine first block with previous state + vaddudm H0, H0, T0 + vaddudm H1, H1, T1 + vaddudm H2, H2, T2 + + MUL_4B(T0, T1, T2, H0, H1, H2, T3, T4, T5) + RED(H0, H1, H2, T0, T1, T2) + + addi DATA, DATA, 64 + bdnz L4B_loop + + C Moving carry + vsrd TMP, H1, D44 + vaddudm H2, H2, TMP + vsrd TMP2, H2, D40 + vsrd TMP2, TMP2, D2 + vsld TMP, TMP2, D2 + vand H1, H1, MASK44 + vaddudm TMP2, TMP2, TMP + vaddudm H0, H0, TMP2 + vsrd TMP, H0, D44 + vaddudm H1, H1, TMP + vand H2, H2, MASK42L + vand H0, H0, MASK44 + + C Convert state of radix 2^44 to 2^64 + vsld TMP, H1, D44 + vor H0, H0, TMP + vsrd H1, H1, D20 + vsld TMP2, H2, D24 + vor H1, H1, TMP2 + vsrd H2, H2, D40 + + xxswapd VSR(H0), VSR(H0) + xxswapd VSR(H1), VSR(H1) + xxswapd VSR(H2), VSR(H2) + + C Store state + stxsd H0, 32(CTX) + stxsd H1, 40(CTX) + stxsd H2, 48(CTX) + +Ldata_r64: + DEFINES_BLOCK_ARG_R64() + C COUNTER = LEN / 16 + cmpldi LEN, 0 + beq Ldone + mtctr LEN + ld H0, P1305_H0 (CTX) + ld H1, P1305_H1 (CTX) + ld H2, P1305_H2 (CTX) +L1B_loop: + 
BLOCK_R64(F0,F1,H0,H1,H2) + mfvsrld H0, VSR(F0) + mfvsrld H1, VSR(F1) + mfvsrd H2, VSR(F1) + addi DATA, DATA, 16 + bdnz L1B_loop + std H0, P1305_H0 (CTX) + std H1, P1305_H1 (CTX) + std H2, P1305_H2 (CTX) + +Ldone: + C Restore non-volatile vector registers + ld r31, -8(SP) + lxv VSR(v31),-32(SP) + lxv VSR(v30),-48(SP) + lxv VSR(v29),-64(SP) + lxv VSR(v28),-80(SP) + lxv VSR(v27),-96(SP) + lxv VSR(v26),-112(SP) + lxv VSR(v25),-128(SP) + lxv VSR(v24),-144(SP) + lxv VSR(v23),-160(SP) + lxv VSR(v22),-176(SP) + lxv VSR(v21),-192(SP) + lxv VSR(v20),-208(SP) + + mr r3, DATA + + blr +EPILOGUE(_nettle_poly1305_blocks) + +.rodata +.align 4 +.mask44: +.quad 0x00000FFFFFFFFFFF,0x00000FFFFFFFFFFF +.mask42l: +.quad 0x0000000000000000,0x000003FFFFFFFFFF +.d40: +.quad 0x0000000000000028,0x0000000000000028 +.d20: +.quad 0x0000000000000014,0x0000000000000014 +.d24: +.quad 0x0000000000000018,0x0000000000000018 +.d44: +.quad 0x000000000000002C,0x000000000000002C diff --git a/powerpc64/p9/poly1305-internal.asm b/powerpc64/p9/poly1305-internal.asm index 18804ca8..a1e46e8f 100644 --- a/powerpc64/p9/poly1305-internal.asm +++ b/powerpc64/p9/poly1305-internal.asm @@ -30,43 +30,20 @@ ifelse(` not, see http://www.gnu.org/licenses/. 
') +include_src(`powerpc64/p9/poly1305.m4') + C Register usage: define(`SP', `r1') define(`TOCP', `r2') -C Argments -define(`CTX', `r3') -define(`M', `r4') -define(`M128', `r5') - -C Working state +C State inputs define(`H0', `r6') define(`H1', `r7') define(`H2', `r8') -define(`T0', `r9') -define(`T1', `r10') -define(`T2', `r8') -define(`T2A', `r9') -define(`T2S', `r10') -define(`IDX', `r6') -define(`RZ', `r7') - -define(`ZERO', `v0') +C State outputs define(`F0', `v1') define(`F1', `v2') -define(`F0S', `v3') -define(`T', `v4') - -define(`R', `v5') -define(`S', `v6') - -define(`T00', `v7') -define(`T10', `v8') -define(`T11', `v9') -define(`MU0', `v10') -define(`MU1', `v11') -define(`TMP', `v12') .text @@ -117,56 +94,14 @@ PROLOGUE(_nettle_poly1305_block) ld H0, P1305_H0 (CTX) ld H1, P1305_H1 (CTX) ld H2, P1305_H2 (CTX) -IF_LE(` - ld T0, 0(M) - ld T1, 8(M) -') -IF_BE(` - ldbrx T0, 0, M - addi M, M, 8 - ldbrx T1, 0, M -') - - addc T0, T0, H0 - adde T1, T1, H1 - adde T2, M128, H2 - - mtvsrdd VSR(T), T0, T1 - - li IDX, P1305_S0 - lxvd2x VSR(R), 0, CTX - lxvd2x VSR(S), IDX, CTX - - andi. 
T2A, T2, 3 - srdi T2S, T2, 2 - - li RZ, 0 - vxor ZERO, ZERO, ZERO - - xxpermdi VSR(MU0), VSR(R), VSR(S), 0b01 - xxswapd VSR(MU1), VSR(R) - - mtvsrdd VSR(T11), 0, T2A - mtvsrdd VSR(T00), T2S, RZ - mtvsrdd VSR(T10), 0, T2 - - vmsumudm F0, T, MU0, ZERO - vmsumudm F1, T, MU1, ZERO - vmsumudm TMP, T11, MU1, ZERO - - vmsumudm F0, T00, S, F0 - vmsumudm F1, T10, MU0, F1 - xxmrgld VSR(TMP), VSR(TMP), VSR(ZERO) - xxswapd VSR(F0S), VSR(F0) - vadduqm F1, F1, TMP - stxsd F0S, P1305_H0 (CTX) + BLOCK_R64(F0,F1,H0,H1,H2) - li IDX, P1305_H1 - xxmrghd VSR(F0), VSR(ZERO), VSR(F0) - vadduqm F1, F1, F0 + li r10, P1305_H1 + xxswapd VSR(F0), VSR(F0) xxswapd VSR(F1), VSR(F1) - stxvd2x VSR(F1), IDX, CTX + stxsd F0, P1305_H0 (CTX) + stxvd2x VSR(F1), r10, CTX blr EPILOGUE(_nettle_poly1305_block) diff --git a/powerpc64/p9/poly1305.m4 b/powerpc64/p9/poly1305.m4 new file mode 100644 index 00000000..3cb63f82 --- /dev/null +++ b/powerpc64/p9/poly1305.m4 @@ -0,0 +1,91 @@ +C Threshold of processing multiple blocks in parallel +C of a multiple of 4 +define(`POLY1305_BLOCK_THRESHOLD', `12') + +C Arguments +define(`CTX', `r3') +define(`DATA', `r4') +define(`PADBYTE', `r5') C Padding byte register +define(`LEN', `r6') + +define(`DEFINES_BLOCK_R64', ` + define(`T0', `r9') + define(`T1', `r10') + define(`T2', `r8') + define(`T2A', `r9') + define(`T2S', `r10') + define(`RZ', `r6') + define(`IDX', `r10') + + define(`ZERO', `v0') + define(`F0S', `v3') + define(`F11', `v4') + define(`T', `v5') + + define(`R', `v6') + define(`S', `v7') + + define(`T00', `v8') + define(`T10', `v9') + define(`T11', `v10') + define(`MU0', `v11') + define(`MU1', `v12') + ') + +C Inputs H0, H1, H2 are general-purpose registers of previous state radix 2^64 +C Outputs F0, F1 are vector registers of result state radix 2^64 sorted as follows +C (low 64-bit of F0) + (low 64-bit of F1) + (high 64-bit of F1) +C BLOCK_R64(F0, F1, H0, H1, H2) +define(`BLOCK_R64', ` + DEFINES_BLOCK_R64() + C Load 128-bit input block +IF_LE(` + ld T0, 0(DATA) 
+ ld T1, 8(DATA) +') +IF_BE(` + li IDX, 8 + ldbrx T1, IDX, DATA + ldbrx T0, 0, DATA +') + C Combine state with input block, latter is padded to 17 bytes + C by low-order byte of PADBYTE register + addc T0, T0, $3 + adde T1, T1, $4 + adde T2, PADBYTE, $5 + + mtvsrdd VSR(T), T0, T1 + + C Load key and pre-computed values + li IDX, 16 + lxvd2x VSR(R), 0, CTX + lxvd2x VSR(S), IDX, CTX + + andi. T2A, T2, 3 + srdi T2S, T2, 2 + + li RZ, 0 + vxor ZERO, ZERO, ZERO + + xxpermdi VSR(MU0), VSR(R), VSR(S), 0b01 + xxswapd VSR(MU1), VSR(R) + + mtvsrdd VSR(T11), 0, T2A + mtvsrdd VSR(T00), T2S, RZ + mtvsrdd VSR(T10), 0, T2 + + C Multiply key by combined state and block + vmsumudm $1, T, MU0, ZERO + vmsumudm $2, T, MU1, ZERO + vmsumudm F11, T11, MU1, ZERO + + vmsumudm $1, T00, S, $1 + vmsumudm $2, T10, MU0, $2 + + C Product addition + xxmrgld VSR(F11), VSR(F11), VSR(ZERO) + vadduqm $2, $2, F11 + + xxmrghd VSR(F0S), VSR(ZERO), VSR($1) + vadduqm $2, $2, F0S + ') |