C powerpc64/p9/poly1305.m4
C (blob 7fc495b4df64372163db1cf04f03ba0277d8737f)
C Threshold for processing multiple blocks in parallel.
C The value is expected to be a multiple of 4.
C NOTE(review): presumably counted in 16-byte blocks; confirm at the
C point of use.
define(`POLY1305_BLOCK_THRESHOLD', `12')

C Bind the symbolic register names used by BLOCK_R64 to two runs of
C consecutive registers: five general-purpose registers starting at GPR0
C and thirteen vector registers starting at VR0.
C Several general-purpose names share one physical register, so a value
C held under one name is lost as soon as one of its aliases is written.
C DEFINES_BLOCK_R64(GPR0, VR0)
define(`DEFINES_BLOCK_R64', `
	C State limbs in radix 2^64: GPR0, GPR0+1, GPR0+2
	define(`H0', `$1')
	define(`H1', `INC_GPR($1,1)')
	define(`H2', `INC_GPR($1,2)')

	C Names that reuse a state register: RZ is H0, T2 is H2
	define(`RZ', `H0')
	define(`T2', `H2')

	C Names sharing scratch register GPR0+3
	define(`T0', `INC_GPR($1,3)')
	define(`T2A', `INC_GPR($1,3)')

	C Names sharing scratch register GPR0+4
	define(`T1', `INC_GPR($1,4)')
	define(`T2S', `INC_GPR($1,4)')
	define(`IDX', `INC_GPR($1,4)')

	C Result vectors: VR0, VR0+1
	define(`F0', `$2')
	define(`F1', `INC_VR($2,1)')

	C Temporary vectors: VR0+2 .. VR0+12, in ascending order
	define(`ZERO', `INC_VR($2,2)')
	define(`F0S', `INC_VR($2,3)')
	define(`F11', `INC_VR($2,4)')
	define(`T', `INC_VR($2,5)')
	define(`R', `INC_VR($2,6)')
	define(`S', `INC_VR($2,7)')
	define(`T00', `INC_VR($2,8)')
	define(`T10', `INC_VR($2,9)')
	define(`T11', `INC_VR($2,10)')
	define(`MU0', `INC_VR($2,11)')
	define(`MU1', `INC_VR($2,12)')
	')

C Process one block: add the 128-bit block at DATA, extended by PADBYTE,
C to the state held in GPR0..GPR2, then multiply the sum by the key
C material loaded from CTX. The result is left in VR0 and VR1.
C CTX is the address of the context where the key and pre-computed values
C are stored
C DATA is the address of the input block
C PADBYTE is the padding byte for the input block
C GPR0 is the first of the sequential general-purpose registers used by
C the macro, laid out as follows
C GPR0, GPR1, GPR2 are inputs representing the previous state in radix 2^64
C GPR3, GPR4 are temporary registers
C VR0 is the first of the sequential vector registers used by the macro,
C laid out as follows
C VR0, VR1 are outputs representing the result state in radix 2^64, sorted
C as follows
C (low 64-bit of VR0) + (low 64-bit of VR1) + (high 64-bit of VR1)
C VR2..VR12 are temporary registers
C Side effects visible in the code below: GPR0 is set to zero, GPR2 is
C overwritten with the top limb of the sum, and the carry bit and CR0
C are modified.
C BLOCK_R64(CTX, DATA, PADBYTE, GPR0, VR0)
define(`BLOCK_R64', `
	DEFINES_BLOCK_R64($4,$5)
	C Load 128-bit input block as two 64-bit words: T0 from offset 0
	C and T1 from offset 8 of DATA. The big-endian variant uses
	C byte-reversed loads. IDX and T1 name the same register, so there
	C the offset held in IDX is replaced by the word loaded into T1.
IF_LE(`
	ld			T0, 0($2)
	ld			T1, 8($2)
')
IF_BE(`
	li			IDX, 8
	ldbrx		T1, IDX, $2
	ldbrx		T0, 0, $2
')
	C Combine state with input block; the latter is padded to 17 bytes
	C by the low-order byte of the PADBYTE register.
	C Three-limb add with carry propagation:
	C (T2, T1, T0) = (PADBYTE, T1, T0) + (H2, H1, H0)
	C T2 is the same register as H2, so H2 is overwritten with the sum.
	addc		T0, T0, H0
	adde		T1, T1, H1
	adde		T2, $3, H2

	C T = (T0 in the high doubleword, T1 in the low doubleword).
	C The registers of T0 and T1 are reused below as T2A, T2S and IDX.
	mtvsrdd		VSR(T), T0, T1

	C Load key and pre-computed values: R is the 16 bytes at CTX and
	C S is the 16 bytes at CTX+16. lxvd2x places the doubleword at the
	C lower address in the high doubleword of the target.
	li			IDX, 16
	lxvd2x		VSR(R), 0, $1
	lxvd2x		VSR(S), IDX, $1

	C Split the top limb: T2A = its low 2 bits, T2S = the remaining
	C bits shifted down by 2. The record form andi. also sets CR0.
	C srdi overwrites the register shared with IDX, which is not read
	C again after the second lxvd2x above.
	andi.		T2A, T2, 3
	srdi		T2S, T2, 2

	C RZ is the register of H0, whose old value was consumed by the
	C addc above. It supplies a zero for the second source operand of
	C mtvsrdd. NOTE(review): a literal 0 is treated as the value zero
	C only in the first source operand - confirm against the ISA.
	li			RZ, 0
	vxor		ZERO, ZERO, ZERO

	C MU0 = (high doubleword of R, low doubleword of S)
	C MU1 = R with its two doublewords swapped
	xxpermdi	VSR(MU0), VSR(R), VSR(S), 0b01
	xxswapd		VSR(MU1), VSR(R)

	C Written as (high doubleword, low doubleword):
	C T11 = (0, T2A), T00 = (T2S, 0), T10 = (0, T2)
	mtvsrdd		VSR(T11), 0, T2A
	mtvsrdd		VSR(T00), T2S, RZ
	mtvsrdd		VSR(T10), 0, T2

	C Multiply key by combined state and block.
	C vmsumudm d, a, b, c computes, as a 128-bit sum,
	C d = a.high * b.high + a.low * b.low + c
	C   F0  = T0 * R.high + T1 * S.low
	C   F1  = T0 * R.low + T1 * R.high
	C   F11 = T2A * R.high
	vmsumudm	F0, T, MU0, ZERO
	vmsumudm	F1, T, MU1, ZERO
	vmsumudm	F11, T11, MU1, ZERO

	C   F0 += T2S * S.high
	C   F1 += T2 * S.low
	vmsumudm	F0, T00, S, F0
	vmsumudm	F1, T10, MU0, F1

	C Product addition.
	C F1 += F11 shifted left by 64 bits: the low doubleword of F11 is
	C moved to the high doubleword and the low doubleword is cleared.
	xxmrgld		VSR(F11), VSR(F11), VSR(ZERO)
	vadduqm		F1, F1, F11

	C F1 += high doubleword of F0. The low doubleword of F0 stays in
	C place as the lowest limb of the result.
	xxmrghd		VSR(F0S), VSR(ZERO), VSR(F0)
	vadduqm		F1, F1, F0S
	')