summaryrefslogtreecommitdiff
path: root/mpn/ia64
diff options
context:
space:
mode:
authorTorbjorn Granlund <tege@gmplib.org>2012-04-07 11:41:59 +0200
committerTorbjorn Granlund <tege@gmplib.org>2012-04-07 11:41:59 +0200
commitf3abdf0e19c358d63a06588139f3041269064b5f (patch)
treea8f52dc90137014fa8a4890310fa40011a42cbc9 /mpn/ia64
parent164507fe075ad6a4291580ef0869462d6c8ce63f (diff)
downloadgmp-f3abdf0e19c358d63a06588139f3041269064b5f.tar.gz
Rewrite inner loop to use ctz table.
Diffstat (limited to 'mpn/ia64')
-rw-r--r--mpn/ia64/gcd_1.asm136
1 files changed, 62 insertions, 74 deletions
diff --git a/mpn/ia64/gcd_1.asm b/mpn/ia64/gcd_1.asm
index 8c7907717..3a173dda0 100644
--- a/mpn/ia64/gcd_1.asm
+++ b/mpn/ia64/gcd_1.asm
@@ -1,8 +1,9 @@
dnl Itanium-2 mpn_gcd_1 -- mpn by 1 gcd.
-dnl Contributed to the GNU project by Kevin Ryde.
+dnl Contributed to the GNU project by Kevin Ryde, innerloop by Torbjorn
+dnl Granlund.
-dnl Copyright 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+dnl Copyright 2002, 2003, 2004, 2005, 2012 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -23,8 +24,8 @@ include(`../config.m4')
C cycles/bitpair (1x1 gcd)
-C Itanium: 14 (approx)
-C Itanium 2: 6.3
+C Itanium: ?
+C Itanium 2: 5.8 (trimmable to 5.64 with huge ctz_table)
C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
@@ -47,29 +48,13 @@ C The main loop consists of transforming x,y to abs(x-y),min(x,y), and then
C stripping factors of 2 from abs(x-y). Those factors of two are
C determined from just y-x, without the abs(), since there's the same
C number of trailing zeros on n or -n in twos complement. That makes the
-C dependent chain
-C
-C cycles
-C 1 sub x-y and x-y-1
-C 3 andcm (x-y-1)&~(x-y)
-C 2 popcnt trailing zeros
-C 3 shr.u strip abs(x-y)
-C ---
-C 9
+C dependent chain 8 cycles deep.
C
C The selection of x-y versus y-x for abs(x-y), and the selection of the
-C minimum of x and y, is done in parallel with the above.
+C minimum of x and y, is done in parallel with the critical path.
C
C The algorithm takes about 0.68 iterations per bit (two N bit operands) on
-C average, hence the final 6.3 cycles/bitpair.
-C
-C The loop is not as fast as one might hope, since there's extra latency
-C from andcm going across to the `multimedia' popcnt, and vice versa from
-C multimedia shr.u back to the integer sub.
-C
-C The loop branch is .sptk.clr since we usually expect a good number of
-C iterations, and the iterations are data dependent so it's unlikely past
-C results will predict anything much about the future.
+C average, hence the final 5.8 cycles/bitpair.
C
C Not done:
C
@@ -90,13 +75,10 @@ C only going down I0), perhaps it'd be possible to shift left instead,
C using add. That would mean keeping track of the lowest not-yet-zeroed
C bit, using some sort of mask.
C
-C Itanium-1:
-C
-C This code is not designed for itanium-1 and in fact doesn't run well on
-C that chip. The loop seems to be about 21 cycles, probably because we end
-C up with a 10 cycle replay for not forcibly scheduling the shr.u latency.
-C Lack of branch hints might introduce a couple of bubbles too.
-C
+C TODO:
+C * Once mod_1_N exists in assembly for Itanium, add conditional calls.
+C * Call bmod_1 even for n=1 when up[0] >> v0 (like other gcd_1 impls).
+C * Probably avoid popcnt also outside of loop, instead use ctz_table.
ASM_START()
.explicit C What does this mean?
@@ -105,6 +87,18 @@ C HP's assembler requires these declarations for importing mpn_modexact_1c_odd
.global mpn_modexact_1c_odd
.type mpn_modexact_1c_odd,@function
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 7)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+ .section ".rodata"
+ctz_table:
+ .byte MAXSHIFT
+forloop(i,1,MASK,
+` .byte m4_count_trailing_zeros(i)
+')
+
PROLOGUE(mpn_gcd_1)
C r32 xp
@@ -148,13 +142,9 @@ ifdef(`HAVE_ABI_32',
mov out_carry = 0
- C
-
popcnt y_twos = y_twos C I0 y twos
;;
- C
-
{ .mmi; add x_orig_one = -1, x_orig C M0 orig x-1
shr.u out_divisor = y, y_twos C I0 y without twos
}{ shr.u y = y, y_twos C I1 y without twos
@@ -171,63 +161,61 @@ ifdef(`HAVE_ABI_32',
mov b0 = save_rp C I0
} ;;
- C
-
popcnt x_orig = x_orig C I0 orig x twos
-
popcnt r9 = r9 C I0 x twos
;;
- C
-
{ cmp.lt p7,p0 = x_orig, y_twos C M0 orig x_twos < y_twos
shr.u x = x, r9 C I0 x odd
} ;;
{ (p7) mov y_twos = x_orig C M0 common twos
add r10 = -1, y C I0 y-1
- (p6) br.dpnt.few .Ldone_y C B0 x%y==0 then result y
+ (p6) br.dpnt.few L(done_y) C B0 x%y==0 then result y
} ;;
- C
-
-
- C No noticable difference in speed for the loop aligned to
- C 32 or just 16.
-.Ltop:
- C r8 x
- C r10 y-1
- C r34 y
- C r38 common twos, for use at end
-
-{ .mmi; cmp.gtu p8,p9 = x, y C M0 x>y
- cmp.ne p10,p0 = x, y C M1 x==y
- sub r9 = y, x C I0 d = y - x
-}{ .mmi; sub r10 = r10, x C M2 d-1 = y - x - 1
-} ;;
-
-{ .mmi; .pred.rel "mutex", p8, p9
- (p8) sub x = x, y C M0 x>y use x=x-y, y unchanged
- (p9) mov y = x C M1 y>=x use y=x
- (p9) mov x = r9 C I0 y>=x use x=y-x
-}{ .mmi; andcm r9 = r10, r9 C M2 (d-1)&~d
+ addl r22 = @ltoffx(ctz_table#), r1
;;
-
- add r10 = -1, y C M0 new y-1
- popcnt r9 = r9 C I0 twos on x-y
-} ;;
-
-{ shr.u x = x, r9 C I0 new x without twos
- (p10) br.sptk.few.clr .Ltop
-} ;;
-
+ ld8.mov r22 = [r22], ctz_table#
+ br L(ent)
+
+
+ ALIGN(32)
+L(top): .pred.rel "mutex", p6,p7
+.mmi; and r20 = MASK, r19
+ (p7) mov y = x
+ (p6) sub x = x, y
+.mmi; (p7) mov x = r19
+ nop 0
+ nop 0
+ ;;
+L(mid):
+.mmb; add r21 = r22, r20
+ cmp.eq p10,p0 = 0, r20
+ (p10) br.spnt.few.clr L(shift_alot)
+ ;;
+.mmi; ld1 r16 = [r21]
+ ;;
+ nop 0
+ shr.u x = x, r16
+ ;;
+L(ent):
+.mmi; sub r19 = y, x
+ cmp.gtu p6,p7 = x, y
+ cmp.ne p8,p0 = x, y
+.mmb; nop 0
+ nop 0
+ (p8) br.sptk.few.clr L(top)
C result is y
-.Ldone_y:
- shl r8 = y, y_twos C I common factors of 2
- ;;
+L(done_y):
mov ar.pfs = save_pfs C I0
+ shl r8 = y, y_twos C I common factors of 2
br.ret.sptk.many b0
+L(shift_alot):
+ extr.u r20 = x, MAXSHIFT, MAXSHIFT
+ shr.u x = x, MAXSHIFT
+ br L(mid)
EPILOGUE()