path: root/mpn/x86_64/mod_1_1.asm
dnl  AMD64 mpn_mod_1_1p

dnl  Contributed to the GNU project by Torbjörn Granlund and Niels Möller.

dnl  Copyright 2009, 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 6
C AMD K10	 6
C Intel P4	26
C Intel core2	12.5
C Intel NHM	11.3
C Intel SBR	 8.4	(slowdown, old code took 8.0)
C Intel atom	26
C VIA nano	13

define(`B2mb',   `%r10')
define(`B2modb', `%r11')
define(`ap',     `%rdi')
define(`n',      `%rsi')
define(`pre',    `%r8')
define(`b',      `%rbx')

define(`r0',     `%rbp') C r1 kept in %rax
define(`r2',     `%rcx') C kept negated. Also used as shift count
define(`t0',     `%r9')

C mp_limb_t
C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
C                       %rdi         %rsi         %rdx                %rcx
C The pre array contains bi, cnt, B1modb, B2modb
C Note: This implementation needs B1modb only when cnt > 0
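C
C A rough usage sketch, not from this file (the real caller is
C mpn/generic/mod_1.c and the prototypes are in gmp-impl.h).  Note that the
C b passed to mpn_mod_1_1p must already be shifted left by cnt = pre[1],
C since the reciprocal pre[0] is computed for b << cnt:
C
C   mp_limb_t pre[4];
C   mpn_mod_1_1p_cps (pre, b);                   /* b unshifted */
C   r = mpn_mod_1_1p (ap, n, b << pre[1], pre);  /* r = {ap,n} mod b, n >= 2 */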

C The iteration is almost as follows,
C
C   r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2modb) B + u
C
C where r2 is a single bit represented as a mask. But to make sure that the
C result fits in two limbs and a bit, the carry from the addition
C
C   r_0 + r_2 B2modb
C
C is handled specially. On carry, we subtract b to cancel the carry,
C and we use instead the value
C
C   r_0 + B2mb (mod B)
C
C This addition can be issued early since it doesn't depend on r2, and it is
C the source of the cmov in the loop.
C
C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
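C
C In C-like pseudocode, one pass of the loop at L(top) does roughly the
C following (a sketch mirroring the asm, not the reference code; u is the
C next lower source limb, B = 2^64, and r2 is kept as an all-ones/zero mask):
C
C   t      = r0 + (r2 ? B2modb : 0);  /* on carry, use t = r0 + B2mb instead */
C   ph:pl  = r1 * B2modb;             /* double-limb product */
C   r0, cy = u + pl;                  /* cy is the carry out */
C   r1, r2 = t + ph + cy;             /* the carry out becomes the new r2 */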

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1_1p)
	push	%rbp
	push	%rbx
	mov	%rdx, b
	mov	%rcx, pre

	mov	-8(ap, n, 8), %rax
	cmp	$3, n
	jnc	L(first)
	mov	-16(ap, n, 8), r0
	jmp	L(reduce_two)

L(first):
	C First iteration, no r2
	mov	24(pre), B2modb
	mul	B2modb
	mov	-24(ap, n, 8), r0
	add	%rax, r0
	mov	-16(ap, n, 8), %rax
	adc	%rdx, %rax
	sbb	r2, r2
	sub	$4, n
	jc	L(reduce_three)

	mov	B2modb, B2mb
	sub	b, B2mb

	ALIGN(16)
L(top):	and	B2modb, r2
	lea	(B2mb, r0), t0
	mul	B2modb
	add	r0, r2
	mov	(ap, n, 8), r0
	cmovc	t0, r2
	add	%rax, r0
	mov	r2, %rax
	adc	%rdx, %rax
	sbb 	r2, r2
	sub 	$1, n
	jnc	L(top)

L(reduce_three):
	C Eliminate r2
	and	b, r2
	sub	r2, %rax

L(reduce_two):
	mov	8(pre), R32(%rcx)
	test	R32(%rcx), R32(%rcx)
	jz	L(normalized)

	C Unnormalized, use B1modb to reduce to size < B (b+1)
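	C (Sketch of this step: the two-limb value r1 B + r0 is replaced by
	C the two-limb sum r0 + r1 * B1modb, congruent to it modulo the
	C unshifted divisor since B1modb represents B mod (b >> cnt), not
	C necessarily fully reduced; the result is then shifted left by cnt
	C so that the normalized division at L(udiv) applies.)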
	mulq	16(pre)
	xor	t0, t0
	add	%rax, r0
	adc	%rdx, t0
	mov	t0, %rax

	C Left-shift to normalize
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rax
	mov	r0, t0
	neg	R32(%rcx)
	shr	R8(%rcx), t0
	or	t0, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), r0, %rax
')
	shl	R8(%rcx), r0
	jmp	L(udiv)

L(normalized):
	mov	%rax, t0
	sub	b, t0
	cmovnc	t0, %rax

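C The code from L(udiv) on is essentially a remainder-only variant of
C udiv_qrnnd_preinv (gmp-impl.h): it divides the two-limb value r1 B + r0 by
C b using the precomputed reciprocal bi = pre[0].  As a rough sketch, with
C all arithmetic mod B and cnt in %rcx:
C
C   qh:ql = r1 * bi + (r1 + 1) * B + r0  /* quotient estimate in qh */
C   r = r0 - qh * b;
C   if (r > ql)  r += b;                 /* the cmp/lea/cmovnc below */
C   if (r >= b)  r -= b;                 /* the branch to L(fix) */
C   return r >> cnt;                     /* cnt = 0 in the normalized case */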
L(udiv):
	lea	1(%rax), t0
	mulq	(pre)
	add	r0, %rax
	adc	t0, %rdx
	imul	b, %rdx
	sub	%rdx, r0
	cmp	r0, %rax
	lea	(b, r0), %rax
	cmovnc	r0, %rax
	cmp	b, %rax
	jnc	L(fix)
L(ok):	shr	R8(%rcx), %rax

	pop	%rbx
	pop	%rbp
	ret
L(fix):	sub	b, %rax
	jmp	L(ok)
EPILOGUE()

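C What the function below stores into pre[], as a C-like sketch derived from
C the asm (the reference version is mpn_mod_1_1p_cps in
C mpn/generic/mod_1_1.c; all products wrap mod B):
C
C   count_leading_zeros (cnt, b);
C   bi = mpn_invert_limb (b << cnt);
C   pre[0] = bi;
C   pre[1] = cnt;
C   pre[3] = -(b << cnt) * bi;                    /* B2modb */
C   if (cnt != 0)
C     pre[2] = (-(b << cnt)
C               * (((mp_limb_t) 1 << cnt) | (bi >> (64 - cnt)))) >> cnt;  /* B1modb */
C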
	ALIGN(16)
PROLOGUE(mpn_mod_1_1p_cps)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)
	sal	R8(%rcx), %r12
	mov	%r12, %rdi
	CALL(	mpn_invert_limb)
	neg	%r12
	mov	%r12, %r8
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	imul	%rax, %r12
	mov	%r12, 24(%rbx)		C store B2modb
	mov	R32(%rbp), R32(%rcx)
	test	R32(%rcx), R32(%rcx)
	jz	L(z)

	mov	$1, R32(%rdx)
ifdef(`SHLD_SLOW',`
	C Destroys %rax, unlike shld. Otherwise, we could do B1modb
	C before B2modb, and get rid of the move %r12, %r8 above.

	shl	R8(%rcx), %rdx
	neg	R32(%rcx)
	shr	R8(%rcx), %rax
	or	%rax, %rdx
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rdx
')
	imul	%rdx, %r8
	shr	R8(%rcx), %r8
	mov	%r8, 16(%rbx)		C store B1modb	
L(z):
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
EPILOGUE()
ASM_END()