ghc/rts/gmp/mpn/pa64/addmul_1.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167

; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and
; add the result to a second limb vector.

; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.

; This file is part of the GNU MP Library.

; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published by
; the Free Software Foundation; either version 2.1 of the License, or (at your
; option) any later version.

; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
; License for more details.

; You should have received a copy of the GNU Lesser General Public License
; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
; MA 02111-1307, USA.

; INPUT PARAMETERS
;   rptr    limb vector that is read, added to and written back
;   sptr    limb vector whose limbs are multiplied by the s2limb value
;   size    number of limbs to process
;   s2limb  multiplier limb, read from -56(%r30) before the frame is allocated
; NOTE(review): the size and s2limb names are not referenced by the visible
; code, which spells out %r24 and -56(%r30) directly.
; Keep comments on their own lines here: text after a macro body would become
; part of the macro expansion.
#define rptr		%r26
#define sptr		%r25
#define size		%r24
#define s2limb		-56(%r30)

; This runs at 11 cycles/limb on a PA8000.  It might be possible to make
; it faster, but the PA8000 pipeline is not publicly documented and it
; is very complex to reverse engineer.

; Scratch register aliases used by the routine.
;   lo, m0, m1, hi   the four 64-bit partial products of one limb multiply
;   t1, t2, t4, t5   temporaries used while summing the partial products
;   t3               result limb that is stored back through rptr
;   rlimb            limb loaded from rptr
;   cylimb           carry limb handed from one limb to the next
; The m1, t3 and t2 aliases map to %r3, %r4 and %r6, which the routine saves
; on entry and restores before returning.
#define t1 %r19
#define rlimb %r20
#define hi %r21
#define lo %r22
#define m0 %r28
#define m1 %r3
#define cylimb %r29
#define t3 %r4
#define t2 %r6
#define t5 %r23
#define t4 %r31
	.level  2.0n
	.code
	.export __gmpn_addmul_1,entry
; __gmpn_addmul_1 -- for each limb i: rptr[i] += sptr[i] * multiplier, with
; the carry limb propagated from one limb to the next.
;
; In:   %r26 = rptr, %r25 = sptr, %r24 = limb count, multiplier at -56(%r30).
; Out:  the final carry limb is left in %r29 and its upper 32 bits are copied
;       to %r28.  NOTE(review): with .level 2.0n this is presumably the
;       %r28:%r29 64-bit return pair -- confirm against the ABI.
; Saves and restores %r3-%r6.  Uses a 128-byte frame: product scratch at
; -128..-104 and saved registers at -96..-72, relative to the adjusted %r30.
;
; The code is software pipelined: the four 32x32 xmpyu partial products of
; the next limb are stored to the frame while the integer instructions sum
; the partial products already loaded for the previous limb.  L$end2 and
; L$end1 drain the last two limbs.
; NOTE(review): a limb count of 0 is not handled; the first addib would step
; the count to -1 and fall into the pipeline.  Callers presumably pass >= 1.
__gmpn_addmul_1
	.proc
	.callinfo frame=128,no_calls
	.entry
        fldd		-56(%r30),%fr5		; s2limb passed on stack
	ldo		128(%r30),%r30		; allocate the 128-byte frame
	add		%r0,%r0,cylimb		; clear cy and cylimb

	; Save %r3-%r6; they are reloaded just before the return.
	std		%r3,-96(%r30)
	std		%r4,-88(%r30)
	std		%r5,-80(%r30)
	std		%r6,-72(%r30)
	depdi,z		1,31,1,%r5		; %r5 = 1 << 32, weight of a mid-sum carry in hi

	fldd		0(sptr),%fr4		; first source limb
	ldo		8(sptr),sptr

	; 64x64 multiply done as four 32x32 products (R = low word, L = high
	; word of the FP register), moved to integer registers via the frame.
	xmpyu		%fr5R,%fr4R,%fr6
	fstd		%fr6,-128(%r30)
	xmpyu		%fr5R,%fr4L,%fr7
	fstd		%fr7,-120(%r30)
	xmpyu		%fr5L,%fr4R,%fr8
	fstd		%fr8,-112(%r30)
	xmpyu		%fr5L,%fr4L,%fr9
	fstd		%fr9,-104(%r30)
	ldd		-128(%r30),lo		; lo = low 64 bit of product
	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
	ldd		-104(%r30),hi		; hi = high 64 bit of product
	addib,=		-1,%r24,L$end1		; only one limb: just sum and store it
	nop
	fldd		0(sptr),%fr4		; second source limb
	ldo		8(sptr),sptr
	addib,=		-1,%r24,L$end2		; exactly two limbs: skip the loop
	nop
	; Loop body: multiply the limb held in %fr4 while summing the partial
	; products in lo/m0/m1/hi that belong to the previous limb.
L$loop
	xmpyu		%fr5R,%fr4R,%fr6
	fstd		%fr6,-128(%r30)
	xmpyu		%fr5R,%fr4L,%fr7
	fstd		%fr7,-120(%r30)
	xmpyu		%fr5L,%fr4R,%fr8
	fstd		%fr8,-112(%r30)
	xmpyu		%fr5L,%fr4L,%fr9
	fstd		%fr9,-104(%r30)
	ldd		0(rptr),rlimb
	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
	add,l		m0,t1,t1		; t1 += m0
	; The next add is nullified unless the 64-bit add of m1 overflows.
	add,l,*nuv	m1,t1,t1		; t1 += m1
	 add,l		%r5,hi,hi		; propagate carry
	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
	depd,z		t1,31,32,t5		; t5 = lo32(t1) << 32
	add,l		t5,t4,t4		; t4 = lo32(lo) + t5, low 64 bits of product
	; From here on the loads of the next partial products are interleaved
	; with the carry chain; each register is reloaded only after its last use.
	ldd		-128(%r30),lo		; lo = low 64 bit of product
	add		cylimb,rlimb,rlimb	; rlimb += carry-in limb, sets carry
	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
	add,dc		t2,hi,cylimb		; cylimb = hi + hi32(t1) + carry
	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
	add		t4,rlimb,t3		; t3 = result limb, sets carry
	ldd		-104(%r30),hi		; hi = high 64 bit of product
	add,dc		%r0,cylimb,cylimb	; fold that carry into cylimb
	fldd		0(sptr),%fr4		; fetch the source limb for the next pass
	ldo		8(sptr),sptr
	std		t3,0(rptr)
	addib,<>	-1,%r24,L$loop
	ldo		8(rptr),rptr		; branch delay slot: advance rptr
	; Drain stage 1: multiply the last source limb and finish the limb
	; before it.  No further source limb is fetched here.
L$end2
	xmpyu		%fr5R,%fr4R,%fr6
	fstd		%fr6,-128(%r30)
	xmpyu		%fr5R,%fr4L,%fr7
	fstd		%fr7,-120(%r30)
	xmpyu		%fr5L,%fr4R,%fr8
	fstd		%fr8,-112(%r30)
	xmpyu		%fr5L,%fr4L,%fr9
	fstd		%fr9,-104(%r30)
	ldd		0(rptr),rlimb
	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
	add,l		m0,t1,t1		; t1 += m0
	; The next add is nullified unless the 64-bit add of m1 overflows.
	add,l,*nuv	m1,t1,t1		; t1 += m1
	 add,l		%r5,hi,hi		; propagate carry
	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
	depd,z		t1,31,32,t5		; t5 = lo32(t1) << 32
	add,l		t5,t4,t4		; t4 = lo32(lo) + t5, low 64 bits of product
	ldd		-128(%r30),lo		; lo = low 64 bit of product
	add		cylimb,rlimb,rlimb	; rlimb += carry-in limb, sets carry
	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
	add,dc		t2,hi,cylimb		; cylimb = hi + hi32(t1) + carry
	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
	add		t4,rlimb,t3		; t3 = result limb, sets carry
	ldd		-104(%r30),hi		; hi = high 64 bit of product
	add,dc		%r0,cylimb,cylimb	; fold that carry into cylimb
	std		t3,0(rptr)
	ldo		8(rptr),rptr
	; Drain stage 2: sum and store the final limb; nothing left to multiply.
L$end1
	ldd		0(rptr),rlimb
	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
	add,l		m0,t1,t1		; t1 += m0
	; The next add is nullified unless the 64-bit add of m1 overflows.
	add,l,*nuv	m1,t1,t1		; t1 += m1
	 add,l		%r5,hi,hi		; propagate carry
	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
	depd,z		t1,31,32,t5		; t5 = lo32(t1) << 32
	add,l		t5,t4,t4		; t4 = lo32(lo) + t5, low 64 bits of product
	add		cylimb,rlimb,rlimb	; rlimb += carry-in limb, sets carry
	add,dc		t2,hi,cylimb		; cylimb = hi + hi32(t1) + carry
	add		t4,rlimb,t3		; t3 = result limb, sets carry
	add,dc		%r0,cylimb,cylimb	; cylimb is now the final carry limb
	std		t3,0(rptr)
	ldo		8(rptr),rptr

	; Restore the registers saved on entry.
	ldd		-96(%r30),%r3
	ldd		-88(%r30),%r4
	ldd		-80(%r30),%r5
	ldd		-72(%r30),%r6

	extrd,u		cylimb,31,32,%r28	; %r28 = upper 32 bits of the carry limb
	bve		(%r2)
	.exit
	ldo		-128(%r30),%r30		; branch delay slot: release the frame
	.procend