1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
|
dnl PowerPC-64 mpn_mod_1_1p
dnl Copyright 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C POWER3/PPC630 ?
C POWER4/PPC970 17
C POWER5 16
C POWER6 30
C POWER7 10.2
C TODO
C * Optimise, in particular the cps function. This was compiler-generated and
C then hand optimised.
C INPUT PARAMETERS
define(`ap', `r3')
define(`n', `r4')
define(`d', `r5')
define(`cps', `r6')
ASM_START()
EXTERN_FUNC(mpn_invert_limb)
PROLOGUE(mpn_mod_1_1p)
sldi r10, r4, 3
addi r4, r4, -1
add r3, r3, r10
ld r0, 16(r6) C B1modb
ld r12, 24(r6) C B2modb
ld r9, -8(r3)
ld r10, -16(r3)
mtctr r4
mulhdu r8, r9, r0
mulld r7, r9, r0
addc r11, r7, r10
addze r9, r8
bdz L(end)
ALIGN(16)
L(top): ld r4, -24(r3)
addi r3, r3, -8
nop
mulld r10, r11, r0
mulld r8, r9, r12
mulhdu r11, r11, r0
mulhdu r9, r9, r12
addc r7, r10, r4
addze r10, r11
addc r11, r8, r7
adde r9, r9, r10
bdnz L(top)
L(end): lwz r0, 12(r6)
ld r3, 0(r6)
cmpdi cr7, r0, 0
beq- cr7, L(4)
subfic r10, r0, 64
sld r9, r9, r0
srd r10, r11, r10
or r9, r10, r9
L(4): subfc r10, r5, r9
subfe r10, r10, r10
nand r10, r10, r10
sld r11, r11, r0
and r10, r10, r5
subf r9, r10, r9
mulhdu r10, r9, r3
mulld r3, r9, r3
addi r9, r9, 1
addc r8, r3, r11
adde r3, r10, r9
mulld r3, r3, r5
subf r3, r3, r11
cmpld cr7, r8, r3
bge cr7, L(5) C FIXME: Make branch-less
add r3, r3, r5
L(5): cmpld cr7, r3, r5
bge- cr7, L(10)
srd r3, r3, r0
blr
L(10): subf r3, r5, r3
srd r3, r3, r0
blr
EPILOGUE()
PROLOGUE(mpn_mod_1_1p_cps)
mflr r0
std r29, -24(r1)
std r30, -16(r1)
std r31, -8(r1)
cntlzd r31, r4
std r0, 16(r1)
extsw r31, r31
mr r29, r3
stdu r1, -144(r1)
sld r30, r4, r31
mr r3, r30
CALL( mpn_invert_limb)
nop
cmpdi cr7, r31, 0
neg r0, r30
beq- cr7, L(13)
subfic r11, r31, 64
li r0, 1
neg r9, r30
srd r11, r3, r11
sld r0, r0, r31
or r0, r11, r0
mulld r0, r0, r9
L(13): mulhdu r9, r0, r3
mulld r11, r0, r3
add r9, r0, r9
nor r9, r9, r9
mulld r9, r9, r30
cmpld cr7, r11, r9
bge cr7, L(14)
add r9, r9, r30
L(14): addi r1, r1, 144
srd r0, r0, r31
std r31, 8(r29)
std r3, 0(r29)
std r0, 16(r29)
ld r0, 16(r1)
srd r9, r9, r31
ld r30, -16(r1)
ld r31, -8(r1)
std r9, 24(r29)
ld r29, -24(r1)
mtlr r0
blr
EPILOGUE()
|