summaryrefslogtreecommitdiff
path: root/mpn/x86/k7/invert_limb.asm
blob: 435fa96d0935b498dfd593ec24847593d1b4018e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
dnl  x86 mpn_invert_limb

dnl  Contributed to the GNU project by Niels Möller

dnl  Copyright 2009, 2011 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C			    cycles (approx)	div
C P5				 ?
C P6 model 0-8,10-12		 ?
C P6 model 9  (Banias)		 ?
C P6 model 13 (Dothan)		 ?
C P4 model 0  (Willamette)	 ?
C P4 model 1  (?)		 ?
C P4 model 2  (Northwood)	 ?
C P4 model 3  (Prescott)	 ?
C P4 model 4  (Nocona)		 ?
C AMD K6			 ?
C AMD K7			41		53
C AMD K8			 ?

C TODO
C  * These c/l numbers are for a non-PIC build.  Consider falling back to using
C    the 'div' instruction for PIC builds.
C  * Perhaps use this file--or at least the algorithm--for more machines than k7.

C Register usage:
C   Input D in %edi
C   Current approximation is in %eax and/or %ecx
C   %ebx and %edx are temporaries
C   %esi and %ebp are unused

defframe(PARAM_DIVISOR,4)

ASM_START()

C Make approx_tab global to work around Apple relocation bug.
ifdef(`DARWIN',`
	deflit(`approx_tab', MPN(invert_limb_tab))
	GLOBL	approx_tab')

	TEXT
	ALIGN(16)
PROLOGUE(mpn_invert_limb)
deflit(`FRAME', 0)
	mov	PARAM_DIVISOR, %eax
	C Avoid push/pop on k7.
	sub	$8, %esp	FRAME_subl_esp(8)
	mov	%ebx, (%esp)
	mov	%edi, 4(%esp)

	mov	%eax, %edi
	shr	$22, %eax
ifdef(`PIC',`
	LEA(	approx_tab, %ebx)
	movzwl	-1024(%ebx, %eax, 2), %eax
',`
	movzwl	-1024+approx_tab(%eax, %eax), %eax	C %eax = v0
')

	C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1
	mov	%eax, %ecx
	imul	%eax, %eax
	mov	%edi, %ebx
	shr	$11, %ebx
	inc	%ebx
	mul	%ebx
	mov	%edi, %ebx				C Prepare
	shr	%ebx
	sbb	%eax, %eax
	sub	%eax, %ebx				C %ebx = d_31, %eax = mask
	shl	$4, %ecx
	dec	%ecx
	sub	%edx, %ecx				C %ecx = v1

	C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33)
	imul	%ecx, %ebx
	and	%ecx, %eax
	shr	%eax
	sub	%ebx, %eax
	mul	%ecx
	mov	%edi, %eax				C Prepare for next mul
	shl	$15, %ecx
	shr	%edx
	add	%edx, %ecx				C %ecx = v2

	mul	%ecx
	add	%edi, %eax
	mov	%ecx, %eax
	adc	%edi, %edx
	sub	%edx, %eax				C %eax = v3

	mov	(%esp), %ebx
	mov	4(%esp), %edi
	add	$8, %esp

	ret

EPILOGUE()

DEF_OBJECT(approx_tab,2)
	.value	0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
	.value	0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
	.value	0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
	.value	0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
	.value	0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
	.value	0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
	.value	0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
	.value	0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
	.value	0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
	.value	0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
	.value	0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
	.value	0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
	.value	0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
	.value	0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
	.value	0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
	.value	0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
	.value	0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
	.value	0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
	.value	0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
	.value	0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
	.value	0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
	.value	0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
	.value	0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
	.value	0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
	.value	0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
	.value	0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
	.value	0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
	.value	0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
	.value	0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
	.value	0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
	.value	0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
	.value	0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
	.value	0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
	.value	0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
	.value	0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
	.value	0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
	.value	0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
	.value	0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
	.value	0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
	.value	0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
	.value	0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
	.value	0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
	.value	0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
	.value	0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
	.value	0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
	.value	0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
	.value	0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
	.value	0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
	.value	0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
	.value	0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
	.value	0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
	.value	0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
	.value	0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
	.value	0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
	.value	0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
	.value	0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
	.value	0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
	.value	0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
	.value	0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
	.value	0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
	.value	0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
	.value	0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
	.value	0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
	.value	0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
END_OBJECT(approx_tab)