1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
|
C powerpc64/p9/poly1305-blocks.asm
ifelse(`
Copyright (C) 2013, 2022 Niels Möller
Copyright (C) 2022 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
include_src(`powerpc64/p9/poly1305.m4')
C Register usage:
define(`SP', `r1')  C Stack pointer
define(`TOCP', `r2')  C TOC pointer, used below for GOT-relative data access
C Arguments
define(`CTX', `r3')  C struct poly1305_ctx *
define(`BLOCKS', `r4')  C Number of 16-byte blocks to process
define(`DATA', `r5')  C Input data pointer
define(`PADBYTE', `r6') C Padding byte register
C Vector register names for the 4-blocks-at-a-time radix-2^44 path.
C Wrapped in a macro so the names are only introduced where that path
C is assembled.
define(`DEFINES_BLOCK_R44', `
C Key limbs in radix 2^44; after setup these hold two key powers
C interleaved across the two doubleword lanes
define(`R0', `v0')
define(`R1', `v1')
define(`R2', `v2')
C Precomputed 20*R1 and 20*R2 for the modular reduction
define(`S1', `v3')
define(`S2', `v4')
C State limbs H = (H0,H1,H2), radix 2^44
define(`H0', `v5')
define(`H1', `v6')
define(`H2', `v7')
C Second interleaved key-power set and its 20x multiples
define(`R3', `v8')
define(`R4', `v9')
define(`R5', `v10')
define(`S4', `v11')
define(`S5', `v12')
C Temporaries: product accumulators and block staging
define(`T0', `v13')
define(`T1', `v14')
define(`T2', `v15')
define(`T3', `v16')
define(`T4', `v17')
define(`T5', `v18')
define(`TMP', `v19')
define(`TMP2', `v20')
C Constants: zero, limb masks, padding bit, shift counts
define(`ZERO', `v21')
define(`MASK44', `v22')  C 2^44-1 in both doublewords
define(`MASK42L', `v23')  C 2^42-1 in the low doubleword only
define(`MASK44L', `v24')  C 2^44-1 in the low doubleword only
define(`T4PAD', `v25')  C padding bit positioned for the top limb
define(`D40', `v26')  C shift count 40
define(`D20', `v27')  C shift count 20
define(`D24', `v28')  C shift count 24
define(`D44', `v29')  C shift count 44
define(`D2', `v30')  C shift count 2
define(`D4', `v31')  C shift count 4
')
C Compute S_1 = 20 * R_1 and S_2 = 20 * R_2
C The factor 20 = 4*5 comes from the reduction 2^130 = 5 (mod p) applied
C to the 44/44/42-bit limb split; it is computed with shifts and an add
C as ((R << 2) + R) << 2.
C COMPUTE_S(S1, S2, R1, R2)
define(`COMPUTE_S', `
vsld $1, $3, D2  C 4*R_1
vsld $2, $4, D2  C 4*R_2
vaddudm $1, $1, $3  C 5*R_1
vaddudm $2, $2, $4  C 5*R_2
vsld $1, $1, D2  C 20*R_1
vsld $2, $2, D2  C 20*R_2
')
C Convert two-part radix 2^64 to three-part radix 2^44 of four blocks
C Inputs: ($1,$2) and ($4,$5) each hold the low/high 64-bit halves of
C two blocks, one block per doubleword lane.  Outputs: ($1,$2,$3) and
C ($4,$5,$6) hold the 44/44/40-bit limbs.  The top limb $3/$6 is at
C most 40 bits and is left unmasked; the padding bit is OR-ed into it
C by the caller.
C R64_TO_R44_4B(VR0, VR1, VR2, VR3, VR4, VR5)
define(`R64_TO_R44_4B', `
vsrd $3, $2, D24  C top limb = high half >> 24
vsrd $6, $5, D24
vsrd TMP, $1, D44  C bits 44..63 of the low half ...
vsrd TMP2, $4, D44
vsld $2, $2, D20  C ... joined with the low 24 bits of the high half
vsld $5, $5, D20
vor $2, $2, TMP  C middle limb assembled
vor $5, $5, TMP2
vand $1, $1, MASK44  C low limb = low half mod 2^44
vand $4, $4, MASK44
vand $2, $2, MASK44  C mask middle limbs to 44 bits
vand $5, $5, MASK44
')
C Multiply H by the key, products only; carry propagation is left to RED.
C Using S_1 = 20*R_1 and S_2 = 20*R_2 folds the mod-p wrap-around into
C the lower limbs:
C T_0 = R_0 H_0 + S_2 H_1 + S_1 H_2
C T_1 = R_1 H_0 + R_0 H_1 + S_2 H_2
C T_2 = R_2 H_0 + R_1 H_1 + R_0 H_2
C vmsumudm forms a 128-bit sum of the products of both doubleword lanes
C plus the 128-bit accumulator, so each T_i is a single 128-bit value.
C MUL(T0, T1, T2, H0, H1, H2)
define(`MUL', `
vmsumudm $1, $4, R0, ZERO
vmsumudm $2, $4, R1, ZERO
vmsumudm $3, $4, R2, ZERO
vmsumudm $1, $5, S2, $1
vmsumudm $2, $5, R0, $2
vmsumudm $3, $5, R1, $3
vmsumudm $1, $6, S1, $1
vmsumudm $2, $6, S2, $2
vmsumudm $3, $6, R0, $3
')
C Apply the same product equations as MUL to four blocks at once.
C Each pair of successive blocks is interleaved horizontally, one block
C per doubleword lane.  Arguments $4..$6 hold the older pair of blocks
C (already combined with the running state) and are multiplied by the
C higher key powers in R3..R5 / S4,S5; arguments $7..$9 hold the newer
C pair and are multiplied by the lower key powers in R0..R2 / S1,S2.
C Both chains accumulate into the same 128-bit sums T0..T2.
C MUL_4B(T0, T1, T2, H0, H1, H2, H3, H4, H5)
define(`MUL_4B', `
vmsumudm $1, $7, R0, ZERO
vmsumudm $2, $7, R1, ZERO
vmsumudm $3, $7, R2, ZERO
vmsumudm $1, $8, S2, $1
vmsumudm $2, $8, R0, $2
vmsumudm $3, $8, R1, $3
vmsumudm $1, $9, S1, $1
vmsumudm $2, $9, S2, $2
vmsumudm $3, $9, R0, $3
vmsumudm $1, $4, R3, $1
vmsumudm $2, $4, R4, $2
vmsumudm $3, $4, R5, $3
vmsumudm $1, $5, S5, $1
vmsumudm $2, $5, R3, $2
vmsumudm $3, $5, R4, $3
vmsumudm $1, $6, S4, $1
vmsumudm $2, $6, S5, $2
vmsumudm $3, $6, R3, $3
')
C Reduction phase: propagate carries through the 128-bit sums T0..T2,
C producing 44/44/42-bit limbs H0..H2.  Each T_i is one 128-bit value
C (vmsumudm already folded both doubleword lanes together), so quadword
C adds are used for the carries.  A 128-bit right shift by 44 is built
C from vsro by 5 bytes (40 bits) followed by vsrd by the remaining 4;
C likewise shift-by-42 uses vsro 40 plus vsrd 2.
C RED(H0, H1, H2, T0, T1, T2)
define(`RED', `
vand $1, $4, MASK44L  C H0 = T0 mod 2^44
vsro $4, $4, D40  C carry = T0 >> 44 ...
vsrd $4, $4, D4
vadduqm $5, $5, $4  C ... added into T1
vand $2, $5, MASK44L  C H1 = T1 mod 2^44
vsro $5, $5, D40  C carry = T1 >> 44 ...
vsrd $5, $5, D4
vadduqm $6, $6, $5  C ... added into T2
vand $3, $6, MASK42L  C H2 = T2 mod 2^42 (top limb has 42 bits)
vsro $6, $6, D40  C carry = T2 >> 42
vsrd $6, $6, D2
vadduqm $1, $1, $6  C fold 5*carry into H0, since 2^130 = 5 (mod p):
vsld $6, $6, D2  C add carry, then 4*carry
vadduqm $1, $1, $6
vsrd TMP, $1, D44  C final carry H0 -> H1
vand $1, $1, MASK44L
vadduqm $2, $2, TMP
')
.text
C const uint8_t *
C _nettle_poly1305_blocks(struct poly1305_ctx *ctx,
C                         size_t blocks, const uint8_t *data)
C Absorbs BLOCKS complete 16-byte blocks from DATA into the state in
C CTX, and returns the advanced data pointer in r3 (r3 is set from
C DATA before blr below).  NOTE(review): an earlier version of this
C header comment said "void" and "size_t length"; the code consumes
C the count as a number of blocks and does return a pointer.
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_poly1305_blocks)
C Save non-volatile registers below SP without moving SP; this relies
C on the area under the stack pointer that the ABI protects for leaf
C use.  NOTE(review): 208 bytes are used here -- confirm this fits the
C protected zone of the target ELFv2 ABI.
std r31,-8(SP)
stxv VSR(v31),-32(SP)
stxv VSR(v30),-48(SP)
stxv VSR(v29),-64(SP)
stxv VSR(v28),-80(SP)
stxv VSR(v27),-96(SP)
stxv VSR(v26),-112(SP)
stxv VSR(v25),-128(SP)
stxv VSR(v24),-144(SP)
stxv VSR(v23),-160(SP)
stxv VSR(v22),-176(SP)
stxv VSR(v21),-192(SP)
stxv VSR(v20),-208(SP)
C Initialize padding byte register: the 2^128 bit appended to every
C complete block
li PADBYTE, 1
C If enough blocks remain, process groups of four with the radix-2^44
C vector code; the leftover BLOCKS mod 4 blocks are handled one at a
C time by the radix-2^64 loop at Ldata_r64.
DEFINES_BLOCK_R44()
cmpldi BLOCKS, POLY1305_BLOCK_THRESHOLD
blt Ldata_r64
srdi r9, BLOCKS, 2  C r9 = number of 4-block groups
andi. BLOCKS, BLOCKS, 3  C BLOCKS = leftover single blocks
mtctr r9
C Initialize constants
vxor ZERO, ZERO, ZERO
vspltisb D2, 2
vspltisb D4, 4
C Load the mask/shift-count table from .rodata via the GOT; the six
C 16-byte entries follow .mask44 in order: mask44, mask42l, d40, d20,
C d24, d44.
addis r9, TOCP, .mask44@got@ha
ld r9, .mask44@got@l(r9)
lxvd2x VSR(MASK44), 0, r9
addi r9, r9, 16
lxvd2x VSR(MASK42L), 0, r9
addi r9, r9, 16
lxvd2x VSR(D40), 0, r9
addi r9, r9, 16
lxvd2x VSR(D20), 0, r9
addi r9, r9, 16
lxvd2x VSR(D24), 0, r9
addi r9, r9, 16
lxvd2x VSR(D44), 0, r9
xxmrghd VSR(MASK44L), VSR(ZERO), VSR(MASK44)  C mask44 in low doubleword only
C Padding bit for the top limb: bit 128 of a block lands at bit 40 of
C the third 44-bit limb (128 - 2*44 = 40); replicated to both lanes
sldi r10, PADBYTE, 40
mtvsrdd VSR(T4PAD), r10, r10
C Load key of radix 2^44
lxsd R0, 0(CTX)
lxsd R1, 8(CTX)
C Split the two 64-bit key words into three 44/44/40-bit limbs
vsrd R2, R1, D24
vsrd TMP, R0, D44
vsld R1, R1, D20
vor R1, R1, TMP
vand R0, R0, MASK44
vand R1, R1, MASK44
C Place R in the high doubleword lane, zero in the low lane
xxmrghd VSR(R0), VSR(R0), VSR(ZERO)
xxmrghd VSR(R1), VSR(R1), VSR(ZERO)
xxmrghd VSR(R2), VSR(R2), VSR(ZERO)
COMPUTE_S(S1, S2, R1, R2)
C Calculate R^2 = R R
MUL(T0, T1, T2, R0, R1, R2)
RED(H0, H1, H2, T0, T1, T2)
C Interleave R and R^2 across the two lanes of R0..R2
xxpermdi VSR(R0), VSR(R0), VSR(H0), 0b01
xxpermdi VSR(R1), VSR(R1), VSR(H1), 0b01
xxpermdi VSR(R2), VSR(R2), VSR(H2), 0b01
COMPUTE_S(S1, S2, R1, R2)
C Calculate R^3 = R^2 R
xxmrghd VSR(R3), VSR(ZERO), VSR(R0)
xxmrghd VSR(R4), VSR(ZERO), VSR(R1)
xxmrghd VSR(R5), VSR(ZERO), VSR(R2)
MUL(T0, T1, T2, R3, R4, R5)
RED(H0, H1, H2, T0, T1, T2)
C Calculate R^4 = R^2 R^2
xxmrgld VSR(R3), VSR(ZERO), VSR(R0)
xxmrgld VSR(R4), VSR(ZERO), VSR(R1)
xxmrgld VSR(R5), VSR(ZERO), VSR(R2)
MUL(T0, T1, T2, R3, R4, R5)
RED(R3, R4, R5, T0, T1, T2)
C Interleave R^3 and R^4 across the two lanes of R3..R5
xxmrgld VSR(R3), VSR(H0), VSR(R3)
xxmrgld VSR(R4), VSR(H1), VSR(R4)
xxmrgld VSR(R5), VSR(H2), VSR(R5)
COMPUTE_S(S4, S5, R4, R5)
C Load state: three 64-bit words at CTX+32, +40, +48
ld r7, 32(CTX)
ld r8, 40(CTX)
ld r31, 48(CTX)
C Fold high part of H2: since 2^130 = 5 (mod p), add 5*(H2 >> 2) into
C the low words with carry, keeping only the low 2 bits of H2
srdi r9, r31, 2
sldi r10, r9, 2
add r10, r10, r9  C r10 = 5 * (H2 >> 2)
andi. r31, r31, 3
li r9, 0
addc r7, r7, r10
adde r8, r8, r9
adde r31, r31, r9
mtvsrdd VSR(H0), 0, r7
mtvsrdd VSR(H1), 0, r8
mtvsrdd VSR(H2), 0, r31
C Convert state of radix 2^64 to 2^44
vsrd TMP, H1, D24
vsld H2, H2, D40
vor H2, H2, TMP
vsrd TMP2, H0, D44
vsld H1, H1, D20
vor H1, H1, TMP2
vand H0, H0, MASK44
vand H1, H1, MASK44
C Offsets for loading 64 bytes (four blocks) per iteration
li r8, 0x10
li r9, 0x20
li r10, 0x30
L4B_loop:
C Load four blocks
lxvd2x VSR(T3), 0, DATA
lxvd2x VSR(T4), r8, DATA
lxvd2x VSR(T5), r9, DATA
lxvd2x VSR(TMP), r10, DATA
IF_BE(`
xxbrd VSR(T3), VSR(T3)
xxbrd VSR(T4), VSR(T4)
xxbrd VSR(T5), VSR(T5)
xxbrd VSR(TMP), VSR(TMP)
')
C Permute blocks in little-endian and line each two successive
C blocks horizontally
xxmrghd VSR(T0), VSR(T4), VSR(T3)
xxmrgld VSR(T1), VSR(T4), VSR(T3)
xxmrghd VSR(T3), VSR(TMP), VSR(T5)
xxmrgld VSR(T4), VSR(TMP), VSR(T5)
R64_TO_R44_4B(T0, T1, T2, T3, T4, T5)
C Set the 2^128 padding bit of each block in the top limb
vor T2, T2, T4PAD
vor T5, T5, T4PAD
C Combine first block pair with previous state
vaddudm H0, H0, T0
vaddudm H1, H1, T1
vaddudm H2, H2, T2
MUL_4B(T0, T1, T2, H0, H1, H2, T3, T4, T5)
RED(H0, H1, H2, T0, T1, T2)
addi DATA, DATA, 64
bdnz L4B_loop
C Moving carry: make each limb canonical 44/44/42 bits, folding the
C H2 overflow back into H0 as carry + 4*carry = 5*carry
vsrd TMP, H1, D44
vaddudm H2, H2, TMP
vsrd TMP2, H2, D40
vsrd TMP2, TMP2, D2  C TMP2 = H2 >> 42
vsld TMP, TMP2, D2  C TMP = 4 * TMP2
vand H1, H1, MASK44
vaddudm TMP2, TMP2, TMP  C TMP2 = 5 * (H2 >> 42)
vaddudm H0, H0, TMP2
vsrd TMP, H0, D44
vaddudm H1, H1, TMP
vand H2, H2, MASK42L
vand H0, H0, MASK44
C Convert state of radix 2^44 to 2^64
vsld TMP, H1, D44
vor H0, H0, TMP
vsrd H1, H1, D20
vsld TMP2, H2, D24
vor H1, H1, TMP2
vsrd H2, H2, D40
xxswapd VSR(H0), VSR(H0)
xxswapd VSR(H1), VSR(H1)
xxswapd VSR(H2), VSR(H2)
C Store state
stxsd H0, 32(CTX)
stxsd H1, 40(CTX)
stxsd H2, 48(CTX)
C One-block path, radix 2^64: process the remaining BLOCKS mod 4
C blocks (or everything, when below the threshold)
Ldata_r64:
cmpldi BLOCKS, 0
beq Ldone
mtctr BLOCKS  C loop count is in CTR; r4 is free from here on
mr r4, PADBYTE
ld r6, P1305_H0 (CTX)
ld r7, P1305_H1 (CTX)
ld r8, P1305_H2 (CTX)
L1B_loop:
C BLOCK_R64 comes from the included poly1305.m4 and absorbs one
C 16-byte block; updated state limbs are read back from v0/v1
BLOCK_R64(CTX,DATA,r4,r6,v0)
mfvsrld r6, VSR(v0)
mfvsrld r7, VSR(v1)
mfvsrd r8, VSR(v1)
addi DATA, DATA, 16
bdnz L1B_loop
std r6, P1305_H0 (CTX)
std r7, P1305_H1 (CTX)
std r8, P1305_H2 (CTX)
Ldone:
C Restore non-volatile vector registers
ld r31, -8(SP)
lxv VSR(v31),-32(SP)
lxv VSR(v30),-48(SP)
lxv VSR(v29),-64(SP)
lxv VSR(v28),-80(SP)
lxv VSR(v27),-96(SP)
lxv VSR(v26),-112(SP)
lxv VSR(v25),-128(SP)
lxv VSR(v24),-144(SP)
lxv VSR(v23),-160(SP)
lxv VSR(v22),-176(SP)
lxv VSR(v21),-192(SP)
lxv VSR(v20),-208(SP)
C Return the advanced data pointer
mr r3, DATA
blr
EPILOGUE(_nettle_poly1305_blocks)
.rodata
C Mask and shift-count table, loaded as six consecutive 16-byte
C entries starting at .mask44 (see the GOT-based load sequence in the
C function above).
.align 4
.mask44:
.quad 0x00000FFFFFFFFFFF,0x00000FFFFFFFFFFF
C 2^44 - 1 in both doublewords
.mask42l:
.quad 0x0000000000000000,0x000003FFFFFFFFFF
C 2^42 - 1 in the low doubleword only
.d40:
.quad 0x0000000000000028,0x0000000000000028
C shift count 40
.d20:
.quad 0x0000000000000014,0x0000000000000014
C shift count 20
.d24:
.quad 0x0000000000000018,0x0000000000000018
C shift count 24
.d44:
.quad 0x000000000000002C,0x000000000000002C
C shift count 44
|