diff options
author | Torbjorn Granlund <tege@gmplib.org> | 2012-04-07 11:41:59 +0200 |
---|---|---|
committer | Torbjorn Granlund <tege@gmplib.org> | 2012-04-07 11:41:59 +0200 |
commit | f3abdf0e19c358d63a06588139f3041269064b5f (patch) | |
tree | a8f52dc90137014fa8a4890310fa40011a42cbc9 /mpn/ia64 | |
parent | 164507fe075ad6a4291580ef0869462d6c8ce63f (diff) | |
download | gmp-f3abdf0e19c358d63a06588139f3041269064b5f.tar.gz |
Rewrite inner loop to use ctz table.
Diffstat (limited to 'mpn/ia64')
-rw-r--r-- | mpn/ia64/gcd_1.asm | 136 |
1 files changed, 62 insertions, 74 deletions
diff --git a/mpn/ia64/gcd_1.asm b/mpn/ia64/gcd_1.asm index 8c7907717..3a173dda0 100644 --- a/mpn/ia64/gcd_1.asm +++ b/mpn/ia64/gcd_1.asm @@ -1,8 +1,9 @@ dnl Itanium-2 mpn_gcd_1 -- mpn by 1 gcd. -dnl Contributed to the GNU project by Kevin Ryde. +dnl Contributed to the GNU project by Kevin Ryde, innerloop by Torbjorn +dnl Granlund. -dnl Copyright 2002, 2003, 2004, 2005 Free Software Foundation, Inc. +dnl Copyright 2002, 2003, 2004, 2005, 2012 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -23,8 +24,8 @@ include(`../config.m4') C cycles/bitpair (1x1 gcd) -C Itanium: 14 (approx) -C Itanium 2: 6.3 +C Itanium: ? +C Itanium 2: 5.8 (trimmable to 5.64 with huge ctz_table) C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y); @@ -47,29 +48,13 @@ C The main loop consists of transforming x,y to abs(x-y),min(x,y), and then C stripping factors of 2 from abs(x-y). Those factors of two are C determined from just y-x, without the abs(), since there's the same C number of trailing zeros on n or -n in twos complement. That makes the -C dependent chain -C -C cycles -C 1 sub x-y and x-y-1 -C 3 andcm (x-y-1)&~(x-y) -C 2 popcnt trailing zeros -C 3 shr.u strip abs(x-y) -C --- -C 9 +C dependent chain 8 cycles deep. C C The selection of x-y versus y-x for abs(x-y), and the selection of the -C minimum of x and y, is done in parallel with the above. +C minimum of x and y, is done in parallel with the critical path. C C The algorithm takes about 0.68 iterations per bit (two N bit operands) on -C average, hence the final 6.3 cycles/bitpair. -C -C The loop is not as fast as one might hope, since there's extra latency -C from andcm going across to the `multimedia' popcnt, and vice versa from -C multimedia shr.u back to the integer sub. -C -C The loop branch is .sptk.clr since we usually expect a good number of -C iterations, and the iterations are data dependent so it's unlikely past -C results will predict anything much about the future. +C average, hence the final 5.8 cycles/bitpair. C C Not done: C @@ -90,13 +75,10 @@ C only going down I0), perhaps it'd be possible to shift left instead, C using add. That would mean keeping track of the lowest not-yet-zeroed C bit, using some sort of mask. C -C Itanium-1: -C -C This code is not designed for itanium-1 and in fact doesn't run well on -C that chip. The loop seems to be about 21 cycles, probably because we end -C up with a 10 cycle replay for not forcibly scheduling the shr.u latency. -C Lack of branch hints might introduce a couple of bubbles too. -C +C TODO: +C * Once mod_1_N exists in assembly for Itanium, add conditional calls. +C * Call bmod_1 even for n=1 when up[0] >> v0 (like other gcd_1 impls). +C * Probably avoid popcnt also outside of loop, instead use ctz_table. ASM_START() .explicit C What does this mean? @@ -105,6 +87,18 @@ C HP's assembler requires these declarations for importing mpn_modexact_1c_odd .global mpn_modexact_1c_odd .type mpn_modexact_1c_odd,@function +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. + +deflit(MAXSHIFT, 7) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + + .section ".rodata" +ctz_table: + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') + PROLOGUE(mpn_gcd_1) C r32 xp @@ -148,13 +142,9 @@ ifdef(`HAVE_ABI_32', mov out_carry = 0 - C - popcnt y_twos = y_twos C I0 y twos ;; - C - { .mmi; add x_orig_one = -1, x_orig C M0 orig x-1 shr.u out_divisor = y, y_twos C I0 y without twos }{ shr.u y = y, y_twos C I1 y without twos @@ -171,63 +161,61 @@ ifdef(`HAVE_ABI_32', mov b0 = save_rp C I0 } ;; - C - popcnt x_orig = x_orig C I0 orig x twos - popcnt r9 = r9 C I0 x twos ;; - C - { cmp.lt p7,p0 = x_orig, y_twos C M0 orig x_twos < y_twos shr.u x = x, r9 C I0 x odd } ;; { (p7) mov y_twos = x_orig C M0 common twos add r10 = -1, y C I0 y-1 - (p6) br.dpnt.few .Ldone_y C B0 x%y==0 then result y + (p6) br.dpnt.few L(done_y) C B0 x%y==0 then result y } ;; - C - - - C No noticable difference in speed for the loop aligned to - C 32 or just 16. -.Ltop: - C r8 x - C r10 y-1 - C r34 y - C r38 common twos, for use at end - -{ .mmi; cmp.gtu p8,p9 = x, y C M0 x>y - cmp.ne p10,p0 = x, y C M1 x==y - sub r9 = y, x C I0 d = y - x -}{ .mmi; sub r10 = r10, x C M2 d-1 = y - x - 1 -} ;; - -{ .mmi; .pred.rel "mutex", p8, p9 - (p8) sub x = x, y C M0 x>y use x=x-y, y unchanged - (p9) mov y = x C M1 y>=x use y=x - (p9) mov x = r9 C I0 y>=x use x=y-x -}{ .mmi; andcm r9 = r10, r9 C M2 (d-1)&~d + addl r22 = @ltoffx(ctz_table#), r1 ;; - - add r10 = -1, y C M0 new y-1 - popcnt r9 = r9 C I0 twos on x-y -} ;; - -{ shr.u x = x, r9 C I0 new x without twos - (p10) br.sptk.few.clr .Ltop -} ;; - + ld8.mov r22 = [r22], ctz_table# + br L(ent) + + + ALIGN(32) +L(top): .pred.rel "mutex", p6,p7 +.mmi; and r20 = MASK, r19 + (p7) mov y = x + (p6) sub x = x, y +.mmi; (p7) mov x = r19 + nop 0 + nop 0 + ;; +L(mid): +.mmb; add r21 = r22, r20 + cmp.eq p10,p0 = 0, r20 + (p10) br.spnt.few.clr L(shift_alot) + ;; +.mmi; ld1 r16 = [r21] + ;; + nop 0 + shr.u x = x, r16 + ;; +L(ent): +.mmi; sub r19 = y, x + cmp.gtu p6,p7 = x, y + cmp.ne p8,p0 = x, y +.mmb; nop 0 + nop 0 + (p8) br.sptk.few.clr L(top) C result is y -.Ldone_y: - shl r8 = y, y_twos C I common factors of 2 - ;; +L(done_y): mov ar.pfs = save_pfs C I0 + shl r8 = y, y_twos C I common factors of 2 br.ret.sptk.many b0 +L(shift_alot): + extr.u r20 = x, MAXSHIFT, MAXSHIFT + shr.u x = x, MAXSHIFT + br L(mid) EPILOGUE() |