summaryrefslogtreecommitdiff
path: root/mpn/powerpc64/mode64/mod_1_1.asm
blob: f24ceb2c8c0ba5fca5730c5f59a935fe29914959 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
dnl  PowerPC-64 mpn_mod_1_1p

dnl  Copyright 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C POWER3/PPC630          ?
C POWER4/PPC970         17
C POWER5                16
C POWER6                30
C POWER7                10.2

C TODO
C  * Optimise, in particular the cps function.  This was compiler-generated and
C    then hand optimised.

C INPUT PARAMETERS
define(`ap',  `r3')
define(`n',   `r4')
define(`d',   `r5')
define(`cps', `r6')

ASM_START()

EXTERN_FUNC(mpn_invert_limb)

C mpn_mod_1_1p -- reduce the n-limb number at ap to a single-limb remainder,
C using the table prepared by mpn_mod_1_1p_cps.
C
C In:   r3 = ap, r4 = n, r5 = d, r6 = cps  (see INPUT PARAMETERS)
C       n must be at least 2: the two most significant limbs are loaded
C       unconditionally before the loop counter is tested.
C       NOTE(review): d is compared against the residue after the residue has
C       been shifted left by cnt, so d is presumably expected to be already
C       shifted left by cnt -- confirm against the callers.
C Out:  r3 = remainder, shifted back right by cnt
C Uses: r0, r7-r12, ctr, cr7 and the carry bit; r1 is not touched, no stack
C       frame is created.
C
C Table layout at r6, as written by mpn_mod_1_1p_cps:
C    0  bi      value returned by mpn_invert_limb
C    8  cnt     shift count, stored as a full doubleword
C   16  B1modb
C   24  B2modb
PROLOGUE(mpn_mod_1_1p)
	sldi	r10, r4, 3		C byte size of the operand, 8*n
	addi	r4, r4, -1		C n-1, becomes the loop counter
	add	r3, r3, r10		C r3 = ap + 8*n, just past the top limb
	ld	r0, 16(r6)		C B1modb
	ld	r12, 24(r6)		C B2modb
	ld	r9, -8(r3)		C ap[n-1]
	ld	r10, -16(r3)		C ap[n-2]
	mtctr	r4
	mulhdu	r8, r9, r0		C high half of ap[n-1] * B1modb
	mulld	r7, r9, r0		C low half of ap[n-1] * B1modb
	addc	r11, r7, r10		C rl = low half + ap[n-2]
	addze	r9, r8			C rh = high half + carry
	bdz	L(end)			C n = 2: no further limbs to fold in

C Loop invariant: r9:r11 = rh:rl is the two-limb running residue.  Each pass
C computes rh:rl = next limb + rl*B1modb + rh*B2modb and steps r3 down by one
C limb.  The loop body runs n-2 times.
	ALIGN(16)
L(top):	ld	r4, -24(r3)		C next lower limb
	addi	r3, r3, -8
	nop
	mulld	r10, r11, r0		C low  half of rl * B1modb
	mulld	r8, r9, r12		C low  half of rh * B2modb
	mulhdu	r11, r11, r0		C high half of rl * B1modb
	mulhdu	r9, r9, r12		C high half of rh * B2modb
	addc	r7, r10, r4		C low(rl*B1modb) + limb
	addze	r10, r11		C high(rl*B1modb) + carry
	addc	r11, r8, r7		C new rl
	adde	r9, r9, r10		C new rh
	bdnz	L(top)

C The count is loaded as the full doubleword that mpn_mod_1_1p_cps stores at
C offset 8.  A word load from offset 12 would only see the low half on a
C big-endian target; the doubleword load is correct for either byte order.
L(end):	ld	r0, 8(r6)		C cnt
	ld	r3, 0(r6)		C bi
	cmpdi	cr7, r0, 0
	beq-	cr7, L(4)		C cnt = 0: rh needs no shifting
	subfic	r10, r0, 64		C 64 - cnt
	sld	r9, r9, r0
	srd	r10, r11, r10		C top cnt bits of rl
	or	r9, r10, r9		C rh = (rh << cnt) | (rl >> (64-cnt))
L(4):	subfc	r10, r5, r9		C carry set when rh >= d
	subfe	r10, r10, r10		C 0 when rh >= d, else all ones
	nand	r10, r10, r10		C mask: all ones when rh >= d, else 0
	sld	r11, r11, r0		C rl <<= cnt
	and	r10, r10, r5		C mask & d
	subf	r9, r10, r9		C rh -= mask & d, so that rh < d
	mulhdu	r10, r9, r3		C high half of rh * bi
	mulld	r3, r9, r3		C low  half of rh * bi
	addi	r9, r9, 1		C rh + 1
	addc	r8, r3, r11		C ql = low(rh*bi) + rl
	adde	r3, r10, r9		C qh = high(rh*bi) + rh + 1 + carry
	mulld	r3, r3, r5		C low half of qh * d
	subf	r3, r3, r11		C r = rl - qh*d, candidate remainder
	cmpld	cr7, r8, r3
	bge	cr7, L(5)		C FIXME: Make branch-less
	add	r3, r3, r5		C r > ql: add d back
L(5):	cmpld	cr7, r3, r5
	bge-	cr7, L(10)		C r >= d: one more subtraction needed
	srd	r3, r3, r0		C undo the shift by cnt
	blr

L(10):	subf	r3, r5, r3		C r -= d
	srd	r3, r3, r0		C undo the shift by cnt
	blr
EPILOGUE()

C mpn_mod_1_1p_cps -- fill in the four-doubleword table that mpn_mod_1_1p
C reads through its cps pointer.
C
C In:   r3 = address of the table to fill
C       r4 = the limb b to precompute for
C       NOTE(review): b is presumably required to be nonzero -- confirm.
C Out:  table[0] = bi, the value returned by mpn_invert_limb (b << cnt)
C       table[1] = cnt, the number of leading zero bits of b
C       table[2] = B1 >> cnt   (loaded as B1modb by mpn_mod_1_1p)
C       table[3] = B2 >> cnt   (loaded as B2modb by mpn_mod_1_1p)
C       where B1 and B2 are the intermediate values computed below.
C Calls mpn_invert_limb, so the link register and r29-r31 are saved and a
C 144-byte stack frame is allocated around the call.
PROLOGUE(mpn_mod_1_1p_cps)
	mflr	r0			C return address, stored at 16(r1) below
	std	r29, -24(r1)		C preserve r29-r31, restored before return
	std	r30, -16(r1)
	std	r31, -8(r1)
	cntlzd	r31, r4			C r31 = cnt = leading zero bits of b
	std	r0, 16(r1)
	extsw	r31, r31
	mr	r29, r3			C keep the table pointer across the call
	stdu	r1, -144(r1)		C allocate the frame
	sld	r30, r4, r31		C r30 = b << cnt
	mr	r3, r30
C r3 = mpn_invert_limb (b << cnt); the result is used as bi from here on.
C NOTE(review): the nop after the call is presumably the slot reserved for
C the linker -- confirm against the CALL macro.
	CALL(	mpn_invert_limb)
	nop
	cmpdi	cr7, r31, 0
	neg	r0, r30			C B1 = -(b << cnt), final when cnt = 0
	beq-	cr7, L(13)
	subfic	r11, r31, 64		C 64 - cnt
	li	r0, 1
	neg	r9, r30			C -(b << cnt)
	srd	r11, r3, r11		C bi >> (64 - cnt)
	sld	r0, r0, r31		C 1 << cnt
	or	r0, r11, r0
	mulld	r0, r0, r9		C B1 = -(b<<cnt) * ((bi>>(64-cnt)) | (1<<cnt))
C B2 is derived from B1 with the same multiply, correct and adjust sequence
C that mpn_mod_1_1p uses for its final step, here with a zero low limb.
L(13):	mulhdu	r9, r0, r3		C high half of B1 * bi
	mulld	r11, r0, r3		C low  half of B1 * bi
	add	r9, r0, r9		C B1 + high half
	nor	r9, r9, r9		C bitwise complement, equals -(B1 + high + 1)
	mulld	r9, r9, r30		C candidate B2 = that value * (b << cnt)
	cmpld	cr7, r11, r9
	bge	cr7, L(14)		C low half >= candidate: keep it
	add	r9, r9, r30		C otherwise add b << cnt back
L(14):	addi	r1, r1, 144		C release the frame
	srd	r0, r0, r31		C B1 >> cnt
	std	r31, 8(r29)		C table[1] = cnt, a full doubleword
	std	r3, 0(r29)		C table[0] = bi
	std	r0, 16(r29)		C table[2] = B1 >> cnt
	ld	r0, 16(r1)		C reload the return address
	srd	r9, r9, r31		C B2 >> cnt
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	std	r9, 24(r29)		C table[3] = B2 >> cnt
	ld	r29, -24(r1)		C restored last, after its final use
	mtlr	r0
	blr
EPILOGUE()