summaryrefslogtreecommitdiff
path: root/powerpc64/p9/poly1305-blocks.asm
blob: cbd0350534342be932767913b42b9714588b2724 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
C powerpc64/p9/poly1305-blocks.asm

ifelse(`
   Copyright (C) 2013, 2022 Niels Möller
   Copyright (C) 2022 Mamone Tarsha
   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free
       Software Foundation; either version 2 of the License, or (at your
       option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
')

include_src(`powerpc64/p9/poly1305.m4')

C Register usage:

C Stack pointer, base of the register save area, and TOC pointer, used
C to reach the constant table through the GOT
define(`SP', `r1')
define(`TOCP', `r2')

C Arguments
C CTX is the context pointer, BLOCKS the number of 16-byte blocks and
C DATA the input pointer
define(`CTX', `r3')
define(`BLOCKS', `r4')
define(`DATA', `r5')

define(`PADBYTE', `r6') C Padding byte register

C Vector register names for the four-block radix 2^44 code path.
C The mappings are independent of each other, so they are listed by role.
define(`DEFINES_BLOCK_R44', `
	C Key power limbs, first and second register triple
	define(`R0', `v0')
	define(`R1', `v1')
	define(`R2', `v2')
	define(`R3', `v8')
	define(`R4', `v9')
	define(`R5', `v10')

	C Multiples by 20 of the upper key limbs, as produced by COMPUTE_S
	define(`S1', `v3')
	define(`S2', `v4')
	define(`S4', `v11')
	define(`S5', `v12')

	C State limbs
	define(`H0', `v5')
	define(`H1', `v6')
	define(`H2', `v7')

	C Input limbs, product accumulators and scratch
	define(`T0', `v13')
	define(`T1', `v14')
	define(`T2', `v15')
	define(`T3', `v16')
	define(`T4', `v17')
	define(`T5', `v18')
	define(`TMP', `v19')
	define(`TMP2', `v20')

	C Constants: zero, masks, padding bit and shift counts
	define(`ZERO', `v21')
	define(`MASK44', `v22')
	define(`MASK44L', `v24')
	define(`MASK42L', `v23')
	define(`T4PAD', `v25')
	define(`D2', `v30')
	define(`D4', `v31')
	define(`D20', `v27')
	define(`D24', `v28')
	define(`D40', `v26')
	define(`D44', `v29')
	')

C COMPUTE_S(S1, S2, R1, R2)
C Set S1 = 20 * R1 and S2 = 20 * R2 in each doubleword, computed as
C ((x << 2) + x) << 2. D2 must hold the shift count 2.
C The two results are independent and are computed one after the other;
C all four registers are distinct at every call site in this file.
define(`COMPUTE_S', `
	C First output: multiply the third argument by 20
	vsld		$1, $3, D2
	vaddudm		$1, $1, $3
	vsld		$1, $1, D2
	C Second output: multiply the fourth argument by 20
	vsld		$2, $4, D2
	vaddudm		$2, $2, $4
	vsld		$2, $2, D2
	')

C Convert two-part radix 2^64 to three-part radix 2^44 of four blocks
C R64_TO_R44_4B(VR0, VR1, VR2, VR3, VR4, VR5)
C On entry VR0 and VR1 hold the low and high 64-bit parts of one group of
C values, one value per doubleword, and VR3 and VR4 those of a second
C group. On exit, per doubleword:
C   VR0 = low & MASK44
C   VR1 = ((high << 20) | (low >> 44)) & MASK44
C   VR2 = high >> 24
C and likewise VR3, VR4, VR5 for the second group.
C Reads D20, D24, D44 and MASK44. Clobbers TMP and TMP2.
define(`R64_TO_R44_4B', `
	C Top limb: the upper 40 bits of the high part
	vsrd		$3, $2, D24
	vsrd		$6, $5, D24
	C Upper 20 bits of the low part, destined for the middle limb
	vsrd		TMP, $1, D44
	vsrd		TMP2, $4, D44
	C Middle limb: lower bits of the high part above those 20 bits
	vsld		$2, $2, D20
	vsld		$5, $5, D20
	vor			$2, $2, TMP
	vor			$5, $5, TMP2
	C Trim the low and middle limbs to 44 bits
	vand		$1, $1, MASK44
	vand		$4, $4, MASK44
	vand		$2, $2, MASK44
	vand		$5, $5, MASK44
	')

C T_0 = R_0 H_0 + S_2 H_1 + S_1 H_2
C T_1 = R_1 H_0 + R_0 H_1 + S_2 H_2
C T_2 = R_2 H_0 + R_1 H_1 + R_0 H_2
C MUL(T0, T1, T2, H0, H1, H2)
C The multiplier limbs R0, R1, R2 and the multiples S1, S2 are not
C parameters; they are the registers named by DEFINES_BLOCK_R44.
C Each vmsumudm adds the products of both doubleword pairs of its two
C sources to a 128-bit accumulator, so every T is a single 128-bit sum
C taken over both doublewords. ZERO must be all zero.
C The T registers are written before the last H register is read, so
C they must not overlap the H arguments.
define(`MUL', `
	C Start the three sums with the terms of the first limb
	vmsumudm	$1, $4, R0, ZERO
	vmsumudm	$2, $4, R1, ZERO
	vmsumudm	$3, $4, R2, ZERO

	C Add the terms of the second limb
	vmsumudm	$1, $5, S2, $1
	vmsumudm	$2, $5, R0, $2
	vmsumudm	$3, $5, R1, $3

	C Add the terms of the third limb
	vmsumudm	$1, $6, S1, $1
	vmsumudm	$2, $6, S2, $2
	vmsumudm	$3, $6, R0, $3
	')

C Apply the equations given for MUL to four blocks
C Each two successive blocks are interleaved horizontally
C MUL_4B(T0, T1, T2, H0, H1, H2, H3, H4, H5)
C The limbs H3, H4, H5 are multiplied by the first key register set
C R0, R1, R2, S1, S2 and the limbs H0, H1, H2 by the second set
C R3, R4, R5, S4, S5. All eighteen products per output are accumulated
C into the same three 128-bit sums T0, T1, T2.
C In the block loop the second set holds the higher key powers, so the
C older pair of blocks, which includes the running state, is multiplied
C by the higher powers.
C The T registers must not overlap any H argument.
define(`MUL_4B', `
	C Later pair of blocks times the first key register set
	vmsumudm	$1, $7, R0, ZERO
	vmsumudm	$2, $7, R1, ZERO
	vmsumudm	$3, $7, R2, ZERO

	vmsumudm	$1, $8, S2, $1
	vmsumudm	$2, $8, R0, $2
	vmsumudm	$3, $8, R1, $3

	vmsumudm	$1, $9, S1, $1
	vmsumudm	$2, $9, S2, $2
	vmsumudm	$3, $9, R0, $3

	C Earlier pair of blocks times the second key register set
	vmsumudm	$1, $4, R3, $1
	vmsumudm	$2, $4, R4, $2
	vmsumudm	$3, $4, R5, $3

	vmsumudm	$1, $5, S5, $1
	vmsumudm	$2, $5, R3, $2
	vmsumudm	$3, $5, R4, $3

	vmsumudm	$1, $6, S4, $1
	vmsumudm	$2, $6, S5, $2
	vmsumudm	$3, $6, R3, $3
	')

C Reduction phase of two interleaved chains
C RED(H0, H1, H2, T0, T1, T2)
C Carry the 128-bit sums T0, T1, T2 into limbs of 44, 44 and 42 bits:
C   H0 = T0 mod 2^44, carry T0 >> 44 into T1
C   H1 = T1 mod 2^44, carry T1 >> 44 into T2
C   H2 = T2 mod 2^42, carry c = T2 >> 42
C   H0 = H0 + 5 c, then one carry from H0 is moved into H1
C H0 is masked again at the end; H1 is not masked after the last
C addition. MASK44L and MASK42L keep bits of the low doubleword only.
C Clobbers T0, T1, T2 and TMP. Reads D2, D4, D40 and D44.
C NOTE(review): each 128-bit right shift is done as vsro with D40
C followed by a per-doubleword vsrd for the remaining bits. That is only
C a true 128-bit shift while no set bit has to cross the doubleword
C boundary in the second step; presumably the sums are small enough for
C this to hold, confirm against the limb bounds.
define(`RED', `
	C Low limb, then carry T0 >> 44 into T1
	vand		$1, $4, MASK44L
	vsro		$4, $4, D40
	vsrd		$4, $4, D4
	vadduqm		$5, $5, $4
	C Middle limb, then carry T1 >> 44 into T2
	vand		$2, $5, MASK44L
	vsro		$5, $5, D40
	vsrd		$5, $5, D4
	vadduqm		$6, $6, $5
	C Top limb of 42 bits, then carry c = T2 >> 42
	vand		$3, $6, MASK42L
	vsro		$6, $6, D40
	vsrd		$6, $6, D2
	C Add c and then 4 c to the low limb, 5 c in total
	vadduqm		$1, $1, $6
	vsld		$6, $6, D2
	vadduqm		$1, $1, $6
	C Move the carry out of the low limb into the middle limb
	vsrd		TMP, $1, D44
	vand		$1, $1, MASK44L
	vadduqm		$2, $2, TMP
	')

.text

C const uint8_t *_nettle_poly1305_blocks(struct poly1305_ctx *ctx,
C 				size_t blocks, const uint8_t *data)
C Absorb BLOCKS full 16-byte blocks from DATA into the state kept in CTX.
C The second argument is a block count, not a byte length: it is used
C directly as a loop count while DATA advances 16 bytes per block.
C On return r3 holds DATA advanced past the processed input.
C With at least POLY1305_BLOCK_THRESHOLD blocks, groups of four are
C handled by the vector radix 2^44 loop and the remaining 0 to 3 blocks
C by BLOCK_R64; shorter inputs use BLOCK_R64 only.
C The padding value is fixed at 1 for every block.
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_poly1305_blocks)
	C Save the non-volatile registers used below: r31 and v20 to v31.
	C They are stored at negative offsets from SP; SP itself is not moved.
	std		r31,-8(SP)
	stxv	VSR(v31),-32(SP)
	stxv	VSR(v30),-48(SP)
	stxv	VSR(v29),-64(SP)
	stxv	VSR(v28),-80(SP)
	stxv	VSR(v27),-96(SP)
	stxv	VSR(v26),-112(SP)
	stxv	VSR(v25),-128(SP)
	stxv	VSR(v24),-144(SP)
	stxv	VSR(v23),-160(SP)
	stxv	VSR(v22),-176(SP)
	stxv	VSR(v21),-192(SP)
	stxv	VSR(v20),-208(SP)

	C Initialize padding byte register
	li		PADBYTE, 1

C Process blocks in groups of four when there are enough of them
	DEFINES_BLOCK_R44()
	cmpldi	BLOCKS, POLY1305_BLOCK_THRESHOLD
	blt		Ldata_r64
	C CTR = number of four-block groups, BLOCKS = leftover 0 to 3 blocks
	srdi	r9, BLOCKS, 2
	andi.	BLOCKS, BLOCKS, 3
	mtctr	r9

	C Initialize constants
	C D2 and D4 are built with splats. The address of .mask44 is read
	C from the GOT and the six 16-byte table entries are loaded by
	C stepping that address, so the loads depend on the table order.

	vxor 		ZERO, ZERO, ZERO
	vspltisb	D2, 2
	vspltisb	D4, 4
	addis		r9, TOCP, .mask44@got@ha
	ld			r9, .mask44@got@l(r9)
	lxvd2x		VSR(MASK44), 0, r9
	addi		r9, r9, 16
	lxvd2x		VSR(MASK42L), 0, r9
	addi		r9, r9, 16
	lxvd2x		VSR(D40), 0, r9
	addi		r9, r9, 16
	lxvd2x		VSR(D20), 0, r9
	addi		r9, r9, 16
	lxvd2x		VSR(D24), 0, r9
	addi		r9, r9, 16
	lxvd2x		VSR(D44), 0, r9
	C MASK44L: doubleword 0 is zero, doubleword 1 is the 44-bit mask
	xxmrghd		VSR(MASK44L), VSR(ZERO), VSR(MASK44)

	C T4PAD = PADBYTE << 40 in both doublewords. Bit 40 of the top
	C limb is bit 128 of a block, since 44 + 44 + 40 = 128.
	sldi		r10, PADBYTE, 40
	mtvsrdd		VSR(T4PAD), r10, r10

	C Load key of radix 2^44
	C The two 64-bit words at offsets 0 and 8 of CTX are split into
	C limbs of 44, 44 and 40 bits. The merges with ZERO leave the key
	C limbs in doubleword 0 and zero in doubleword 1.
	lxsd		R0, 0(CTX)
	lxsd		R1, 8(CTX)
	vsrd		R2, R1, D24
	vsrd		TMP, R0, D44
	vsld		R1, R1, D20
	vor			R1, R1, TMP
	vand		R0, R0, MASK44
	vand		R1, R1, MASK44
	xxmrghd		VSR(R0), VSR(R0), VSR(ZERO)
	xxmrghd		VSR(R1), VSR(R1), VSR(ZERO)
	xxmrghd		VSR(R2), VSR(R2), VSR(ZERO)

	COMPUTE_S(S1, S2, R1, R2)

	C Calculate R^2 = R R
	
	MUL(T0, T1, T2, R0, R1, R2)
	RED(H0, H1, H2, T0, T1, T2)
	C Keep R in doubleword 0 and place R^2 in doubleword 1
	xxpermdi	VSR(R0), VSR(R0), VSR(H0), 0b01
	xxpermdi	VSR(R1), VSR(R1), VSR(H1), 0b01
	xxpermdi	VSR(R2), VSR(R2), VSR(H2), 0b01

	COMPUTE_S(S1, S2, R1, R2)

	C Calculate R^3 = R^2 R
	
	C Move R to doubleword 1 so that it meets R^2 in the product sum
	xxmrghd		VSR(R3), VSR(ZERO), VSR(R0)
	xxmrghd		VSR(R4), VSR(ZERO), VSR(R1)
	xxmrghd		VSR(R5), VSR(ZERO), VSR(R2)

	MUL(T0, T1, T2, R3, R4, R5)
	RED(H0, H1, H2, T0, T1, T2)

	C Calculate R^4 = R^2 R^2
	
	xxmrgld		VSR(R3), VSR(ZERO), VSR(R0)
	xxmrgld		VSR(R4), VSR(ZERO), VSR(R1)
	xxmrgld		VSR(R5), VSR(ZERO), VSR(R2)

	MUL(T0, T1, T2, R3, R4, R5)
	RED(R3, R4, R5, T0, T1, T2)
	C Second key set: R^3 in doubleword 0 and R^4 in doubleword 1
	xxmrgld		VSR(R3), VSR(H0), VSR(R3)
	xxmrgld		VSR(R4), VSR(H1), VSR(R4)
	xxmrgld		VSR(R5), VSR(H2), VSR(R5)

	COMPUTE_S(S4, S5, R4, R5)

	C Load state
	C NOTE(review): literal offsets 32, 40 and 48 are used here while
	C the single-block path below uses P1305_H0, P1305_H1 and P1305_H2.
	C Presumably these name the same fields; confirm against the
	C definitions, which are not in this file.
	ld			r7, 32(CTX)
	ld			r8, 40(CTX)
	ld			r31, 48(CTX)

	C Fold high part of H2
	C Bits of the top word above bit 1 are multiplied by 5 and added
	C to the low word, with the carry propagated upwards.
	srdi		r9, r31, 2
	sldi		r10, r9, 2
	add			r10, r10, r9
	andi.		r31, r31, 3
	li			r9, 0
	addc		r7, r7, r10
	adde		r8, r8, r9
	adde		r31, r31, r9

	C State words go to doubleword 1; doubleword 0 is zero
	mtvsrdd		VSR(H0), 0, r7
	mtvsrdd		VSR(H1), 0, r8
	mtvsrdd		VSR(H2), 0, r31

	C Convert state of radix 2^64 to 2^44
	vsrd		TMP, H1, D24
	vsld		H2, H2, D40
	vor			H2, H2, TMP
	vsrd		TMP2, H0, D44
	vsld		H1, H1, D20
	vor			H1, H1, TMP2
	vand		H0, H0, MASK44
	vand		H1, H1, MASK44

	C Byte offsets of the second, third and fourth block of a group
	li			r8, 0x10
	li			r9, 0x20
	li			r10, 0x30
L4B_loop:
	C Load four blocks
	lxvd2x		VSR(T3), 0, DATA
	lxvd2x		VSR(T4), r8, DATA
	lxvd2x		VSR(T5), r9, DATA
	lxvd2x		VSR(TMP), r10, DATA
IF_BE(`
	xxbrd		VSR(T3), VSR(T3)
	xxbrd		VSR(T4), VSR(T4)
	xxbrd		VSR(T5), VSR(T5)
	xxbrd		VSR(TMP), VSR(TMP)
')
	C Permute blocks in little-endian and line each two successive
	C blocks horizontally
	C T0 and T1 receive the low and high halves of blocks one and two,
	C T3 and T4 those of blocks three and four. The earlier block of
	C each pair ends up in doubleword 1.
	xxmrghd		VSR(T0), VSR(T4), VSR(T3)
	xxmrgld		VSR(T1), VSR(T4), VSR(T3)
	xxmrghd		VSR(T3), VSR(TMP), VSR(T5)
	xxmrgld		VSR(T4), VSR(TMP), VSR(T5)
	R64_TO_R44_4B(T0, T1, T2, T3, T4, T5)
	C Set the padding bit in the top limb of all four blocks
	vor			T2, T2, T4PAD
	vor			T5, T5, T4PAD

	C Combine first block with previous state
	C The state sits in doubleword 1, the same lane as the first block
	vaddudm		H0, H0, T0
	vaddudm		H1, H1, T1
	vaddudm		H2, H2, T2

	MUL_4B(T0, T1, T2, H0, H1, H2, T3, T4, T5)
	RED(H0, H1, H2, T0, T1, T2)

	addi		DATA, DATA, 64
	bdnz		L4B_loop

	C Moving carry
	C Propagate the carry left in H1 by RED into H2, fold the bits of
	C H2 above 42 back into H0 multiplied by 5, then carry from H0
	C into H1 once more.
	vsrd		TMP, H1, D44
	vaddudm		H2, H2, TMP
	vsrd		TMP2, H2, D40
	vsrd		TMP2, TMP2, D2
	vsld		TMP, TMP2, D2
	vand		H1, H1, MASK44
	vaddudm		TMP2, TMP2, TMP
	vaddudm		H0, H0, TMP2
	vsrd		TMP, H0, D44
	vaddudm		H1, H1, TMP
	vand		H2, H2, MASK42L
	vand		H0, H0, MASK44

	C Convert state of radix 2^44 to 2^64
	vsld		TMP, H1, D44
	vor			H0, H0, TMP
	vsrd		H1, H1, D20
	vsld		TMP2, H2, D24
	vor			H1, H1, TMP2
	vsrd		H2, H2, D40

	C Bring the state from doubleword 1 to doubleword 0, which is the
	C doubleword that stxsd stores
	xxswapd		VSR(H0), VSR(H0)
	xxswapd		VSR(H1), VSR(H1)
	xxswapd		VSR(H2), VSR(H2)

	C Store state
	stxsd		H0, 32(CTX)
	stxsd		H1, 40(CTX)
	stxsd		H2, 48(CTX)

Ldata_r64:
	C Process the remaining blocks one at a time
	cmpldi	BLOCKS, 0
	beq		Ldone
	mtctr	BLOCKS
	C The block count now lives in CTR, so r4 is free to take the
	C padding value before r6 is reused for the state.
	mr			r4, PADBYTE
	ld			r6, P1305_H0 (CTX)
	ld			r7, P1305_H1 (CTX)
	ld			r8, P1305_H2 (CTX)
L1B_loop:
	C BLOCK_R64 comes from the included poly1305.m4. Judging by the
	C moves that follow, it leaves the new state in v0 and v1.
	BLOCK_R64(CTX,DATA,r4,r6,v0)
	mfvsrld		r6, VSR(v0)
	mfvsrld		r7, VSR(v1)
	mfvsrd		r8, VSR(v1)
	addi	DATA, DATA, 16
	bdnz	L1B_loop
	std		r6, P1305_H0 (CTX)
	std		r7, P1305_H1 (CTX)
	std		r8, P1305_H2 (CTX)

Ldone:
	C Restore the non-volatile registers saved on entry
	ld		r31, -8(SP)
	lxv		VSR(v31),-32(SP)
	lxv		VSR(v30),-48(SP)
	lxv		VSR(v29),-64(SP)
	lxv		VSR(v28),-80(SP)
	lxv		VSR(v27),-96(SP)
	lxv		VSR(v26),-112(SP)
	lxv		VSR(v25),-128(SP)
	lxv		VSR(v24),-144(SP)
	lxv		VSR(v23),-160(SP)
	lxv		VSR(v22),-176(SP)
	lxv		VSR(v21),-192(SP)
	lxv		VSR(v20),-208(SP)

	C Return the data pointer advanced past the processed blocks
	mr		r3, DATA

	blr
EPILOGUE(_nettle_poly1305_blocks)

.rodata
.align 4
C Constant table of 16-byte entries. The code takes the address of
C .mask44 and reaches the other entries by stepping 16 bytes at a time,
C so the order of the entries must not change.
C Low 44 bits set, in both doublewords
.mask44:
.quad 0x00000FFFFFFFFFFF
.quad 0x00000FFFFFFFFFFF
C Low 42 bits set, in the second doubleword only
.mask42l:
.quad 0x0000000000000000
.quad 0x000003FFFFFFFFFF
C Shift counts 40, 20, 24 and 44, each in both doublewords
.d40:
.quad 40
.quad 40
.d20:
.quad 20
.quad 20
.d24:
.quad 24
.quad 24
.d44:
.quad 44
.quad 44