Rewrite inner loop to use ctz table.

author: Torbjorn Granlund <tege@gmplib.org> 2012-04-07 11:41:59 +0200
committer: Torbjorn Granlund <tege@gmplib.org> 2012-04-07 11:41:59 +0200
commit: f3abdf0e19c358d63a06588139f3041269064b5f (patch)
tree: a8f52dc90137014fa8a4890310fa40011a42cbc9 /mpn/ia64
parent: 164507fe075ad6a4291580ef0869462d6c8ce63f (diff)
download: gmp-f3abdf0e19c358d63a06588139f3041269064b5f.tar.gz
1 files changed, 62 insertions, 74 deletions
diff --git a/mpn/ia64/gcd_1.asm b/mpn/ia64/gcd_1.asm
index 8c7907717..3a173dda0 100644
--- a/mpn/ia64/gcd_1.asm
+++ b/mpn/ia64/gcd_1.asm
@@ -1,8 +1,9 @@
 dnl  Itanium-2 mpn_gcd_1 -- mpn by 1 gcd.
 
-dnl  Contributed to the GNU project by Kevin Ryde.
+dnl  Contributed to the GNU project by Kevin Ryde, innerloop by Torbjorn
+dnl  Granlund.
 
-dnl  Copyright 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+dnl  Copyright 2002, 2003, 2004, 2005, 2012 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -23,8 +24,8 @@ include(`../config.m4')
 
 
 C           cycles/bitpair (1x1 gcd)
-C Itanium:      14 (approx)
-C Itanium 2:     6.3
+C Itanium:       ?
+C Itanium 2:     5.8  (trimmable to 5.64 with huge ctz_table)
 
 
 C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
@@ -47,29 +48,13 @@ C The main loop consists of transforming x,y to abs(x-y),min(x,y), and then
 C stripping factors of 2 from abs(x-y).  Those factors of two are
 C determined from just y-x, without the abs(), since there's the same
 C number of trailing zeros on n or -n in twos complement.  That makes the
-C dependent chain
-C
-C	cycles
-C	  1    sub     x-y and x-y-1
-C	  3    andcm   (x-y-1)&~(x-y)
-C	  2    popcnt  trailing zeros
-C	  3    shr.u   strip abs(x-y)
-C	 ---
-C	  9
+C dependent chain 8 cycles deep.
 C
 C The selection of x-y versus y-x for abs(x-y), and the selection of the
-C minimum of x and y, is done in parallel with the above.
+C minimum of x and y, is done in parallel with the critical path.
 C
 C The algorithm takes about 0.68 iterations per bit (two N bit operands) on
-C average, hence the final 6.3 cycles/bitpair.
-C
-C The loop is not as fast as one might hope, since there's extra latency
-C from andcm going across to the `multimedia' popcnt, and vice versa from
-C multimedia shr.u back to the integer sub.
-C
-C The loop branch is .sptk.clr since we usually expect a good number of
-C iterations, and the iterations are data dependent so it's unlikely past
-C results will predict anything much about the future.
+C average, hence the final 5.8 cycles/bitpair.
 C
 C Not done:
 C
@@ -90,13 +75,10 @@ C only going down I0), perhaps it'd be possible to shift left instead,
 C using add.  That would mean keeping track of the lowest not-yet-zeroed
 C bit, using some sort of mask.
 C
-C Itanium-1:
-C
-C This code is not designed for itanium-1 and in fact doesn't run well on
-C that chip.  The loop seems to be about 21 cycles, probably because we end
-C up with a 10 cycle replay for not forcibly scheduling the shr.u latency.
-C Lack of branch hints might introduce a couple of bubbles too.
-C
+C TODO:
+C  * Once mod_1_N exists in assembly for Itanium, add conditional calls.
+C  * Call bmod_1 even for n=1 when up[0] >> v0 (like other gcd_1 impls).
+C  * Probably avoid popcnt also outside of loop, instead use ctz_table.
 
 ASM_START()
 	.explicit				C What does this mean?
@@ -105,6 +87,18 @@ C HP's assembler requires these declarations for importing mpn_modexact_1c_odd
 	.global	mpn_modexact_1c_odd
 	.type	mpn_modexact_1c_odd,@function
 
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 7)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+	.section	".rodata"
+ctz_table:
+	.byte	MAXSHIFT
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+
 PROLOGUE(mpn_gcd_1)
 
 		C r32	xp
@@ -148,13 +142,9 @@ ifdef(`HAVE_ABI_32',
 
 		mov	out_carry = 0
 
-		C
-
 		popcnt	y_twos = y_twos		C I0  y twos
 		;;
 
-		C
-
 { .mmi;		add	x_orig_one = -1, x_orig	C M0  orig x-1
 		shr.u	out_divisor = y, y_twos	C I0  y without twos
 }{		shr.u	y = y, y_twos		C I1  y without twos
@@ -171,63 +161,61 @@ ifdef(`HAVE_ABI_32',
 		mov	b0 = save_rp		C I0
 }		;;
 
-		C
-
 		popcnt	x_orig = x_orig		C I0  orig x twos
-
 		popcnt	r9 = r9			C I0  x twos
 		;;
 
-		C
-
 {		cmp.lt	p7,p0 = x_orig, y_twos	C M0  orig x_twos < y_twos
 		shr.u	x = x, r9		C I0  x odd
 }		;;
 
 {	(p7)	mov	y_twos = x_orig		C M0  common twos
 		add	r10 = -1, y		C I0  y-1
-	(p6)	br.dpnt.few .Ldone_y		C B0  x%y==0 then result y
+	(p6)	br.dpnt.few L(done_y)		C B0  x%y==0 then result y
 }		;;
 
-		C
-
-
-		C No noticable difference in speed for the loop aligned to
-		C 32 or just 16.
-.Ltop:
-		C r8	x
-		C r10  y-1
-		C r34	y
-		C r38	common twos, for use at end
-
-{  .mmi;	cmp.gtu	p8,p9 = x, y	C M0  x>y
-		cmp.ne	p10,p0 = x, y	C M1  x==y
-		sub	r9 = y, x	C I0  d = y - x
-}{ .mmi;	sub	r10 = r10, x	C M2  d-1 = y - x - 1
-}		;;
-
-{ .mmi;	.pred.rel "mutex", p8, p9
-	(p8)	sub	x = x, y	C M0  x>y  use x=x-y, y unchanged
-	(p9)	mov	y = x		C M1  y>=x use y=x
-	(p9)	mov	x = r9		C I0  y>=x use x=y-x
-}{ .mmi;	andcm	r9 = r10, r9	C M2  (d-1)&~d
+		addl	r22 = @ltoffx(ctz_table#), r1
 		;;
-
-		add	r10 = -1, y	C M0  new y-1
-		popcnt	r9 = r9		C I0  twos on x-y
-}		;;
-
-{		shr.u	x = x, r9	C I0   new x without twos
-	(p10)	br.sptk.few.clr .Ltop
-}		;;
-
+		ld8.mov r22 = [r22], ctz_table#
+		br	L(ent)
+
+
+		ALIGN(32)
+L(top):		.pred.rel "mutex", p6,p7
+.mmi;		and	r20 = MASK, r19
+	(p7)	mov	y = x
+	(p6)	sub	x = x, y
+.mmi;	(p7)	mov	x = r19
+		nop	0
+		nop	0
+		;;
+L(mid):
+.mmb;		add	r21 = r22, r20
+		cmp.eq	p10,p0 = 0, r20
+	(p10)	br.spnt.few.clr	 L(shift_alot)
+		;;
+.mmi;		ld1	r16 = [r21]
+		;;
+		nop	0
+		shr.u	x = x, r16
+		;;
+L(ent):
+.mmi;		sub	r19 = y, x
+		cmp.gtu	p6,p7 = x, y
+		cmp.ne	p8,p0 = x, y
+.mmb;		nop	0
+		nop	0
+	(p8)	br.sptk.few.clr L(top)
 
 
 		C result is y
-.Ldone_y:
-		shl	r8 = y, y_twos		C I   common factors of 2
-		;;
+L(done_y):
 		mov	ar.pfs = save_pfs	C I0
+		shl	r8 = y, y_twos		C I   common factors of 2
 		br.ret.sptk.many b0
 
+L(shift_alot):
+		extr.u	r20 = x, MAXSHIFT, MAXSHIFT
+		shr.u	x = x, MAXSHIFT
+		br	L(mid)
 EPILOGUE()
author	Torbjorn Granlund <tege@gmplib.org>	2012-04-07 11:41:59 +0200
committer	Torbjorn Granlund <tege@gmplib.org>	2012-04-07 11:41:59 +0200
commit	f3abdf0e19c358d63a06588139f3041269064b5f (patch)
tree	a8f52dc90137014fa8a4890310fa40011a42cbc9 /mpn/ia64
parent	164507fe075ad6a4291580ef0869462d6c8ce63f (diff)
download	gmp-f3abdf0e19c358d63a06588139f3041269064b5f.tar.gz