diff options
author | Kevin Ryde <user42@zip.com.au> | 2002-07-10 03:08:09 +0200 |
---|---|---|
committer | Kevin Ryde <user42@zip.com.au> | 2002-07-10 03:08:09 +0200 |
commit | 3e246811d9d8e402a80e82ff20a71bc8d756ff8b (patch) | |
tree | d9579a90dc2319b78f0cc69e6bb24b076c6dba6b /longlong.h | |
parent | f0201b28f82f399a6d5a6bedbf709c50cf12f067 (diff) | |
download | gmp-3e246811d9d8e402a80e82ff20a71bc8d756ff8b.tar.gz |
* longlong.h (count_leading_zeros) [pentiummmx]: Don't use __clz_tab
variant under LONGLONG_STANDALONE.
Diffstat (limited to 'longlong.h')
-rw-r--r-- | longlong.h | 68 |
1 files changed, 32 insertions, 36 deletions
```diff
diff --git a/longlong.h b/longlong.h
index 4ec1636e5..8e4e1c487 100644
--- a/longlong.h
+++ b/longlong.h
@@ -547,40 +547,18 @@ extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
 : "=a" (q), "=d" (r) \
 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
 
-/* P5 bsrl takes between 10 and 72 cycles depending where the most
- significant 1 bit is, hence the use of the alternatives below. bsfl is
- slow too, between 18 and 42 depending where the least significant 1 bit
- is. The faster count_leading_zeros are pressed into service via the
- generic count_trailing_zeros at the end of the file. */
-
-#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium
-
-/* The following should be a fixed 14 cycles or so. Some scheduling
- opportunities should be available between the float load/store too. This
- is used (with "n&-n" to get trailing zeros) in gcc 3 for __builtin_ffs
- and is apparently suggested by the Intel optimizing manual (don't know
- exactly where). gcc 2.95 or up will be best for this, so the "double" is
- correctly aligned on the stack. */
-
-#define count_leading_zeros(c,n) \
- do { \
- union { \
- double d; \
- unsigned a[2]; \
- } __u; \
- ASSERT ((n) != 0); \
- __u.d = (UWtype) (n); \
- (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
- } while (0)
-#define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
-
-#else /* ! pentium */
-#if HAVE_HOST_CPU_pentiummmx
-
+#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
+/* Pentium bsrl takes between 10 and 72 cycles depending where the most
+ significant 1 bit is, hence the use of the following alternatives. bsfl
+ is slow too, between 18 and 42 depending where the least significant 1
+ bit is, so let the generic count_trailing_zeros below make use of the
+ count_leading_zeros here too. */
+
+#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
- cache miss reading from __clz_tab. It's favoured over the float above so
- as to avoid mixing MMX and x87, since the penalty for switching between
- the two is about 100 cycles.
+ cache miss reading from __clz_tab. For P55 it's favoured over the float
+ below so as to avoid mixing MMX and x87, since the penalty for switching
+ between the two is about 100 cycles.
 
 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
 16, -1 for 8, or 0 otherwise. This could be written equivalently as
@@ -609,11 +587,30 @@ extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
 __shift = __shift*8 + 24 + 1; \
 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
 } while (0)
-
 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
 
-#else /* !pentiummmx */
+#else /* ! pentiummmx || LONGLONG_STANDALONE */
+/* The following should be a fixed 14 cycles or so. Some scheduling
+ opportunities should be available between the float load/store too. This
+ sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
+ apparently suggested by the Intel optimizing manual (don't know exactly
+ where). gcc 2.95 or up will be best for this, so the "double" is
+ correctly aligned on the stack. */
+#define count_leading_zeros(c,n) \
+ do { \
+ union { \
+ double d; \
+ unsigned a[2]; \
+ } __u; \
+ ASSERT ((n) != 0); \
+ __u.d = (UWtype) (n); \
+ (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
+ } while (0)
+#define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
+#endif /* pentiummx */
+
+#else /* ! pentium */
 /* On P6, gcc prior to 3.0 generates a partial register stall for
 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the
@@ -647,7 +644,6 @@ extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
 ASSERT ((x) != 0); \
 __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))); \
 } while (0)
-#endif /* ! pentiummmx */
 #endif /* ! pentium */
 
 #ifndef UMUL_TIME
```