diff options
author | pelissip <pelissip@280ebfd0-de03-0410-8827-d642c229c3f4> | 2004-12-08 14:08:45 +0000 |
---|---|---|
committer | pelissip <pelissip@280ebfd0-de03-0410-8827-d642c229c3f4> | 2004-12-08 14:08:45 +0000 |
commit | a824f7ad7bc1fe409778f39947af2914f36f0cb6 (patch) | |
tree | 3bf05a53776f526ca2f80a6380956ed49db09325 /mul.c | |
parent | ba5d77b6a9e5019e745dfaf8d5d4b6265449ac69 (diff) | |
download | mpfr-a824f7ad7bc1fe409778f39947af2914f36f0cb6.tar.gz |
Optimize mpfr_mul by inlining and rewriting the rounding.
It seems that GCC option `-frename-registers` for mpfr_mul / Athlon XP
improves its performance (But it decreases it on Pentium4)...
git-svn-id: svn://scm.gforge.inria.fr/svn/mpfr/trunk@3120 280ebfd0-de03-0410-8827-d642c229c3f4
Diffstat (limited to 'mul.c')
-rw-r--r-- | mul.c | 134 |
1 files changed, 61 insertions, 73 deletions
@@ -24,101 +24,97 @@ MA 02111-1307, USA. */ int mpfr_mul (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mp_rnd_t rnd_mode) { - int sign_product, cc, inexact; - mp_exp_t ax; + int sign, inexact; + mp_exp_t ax, ax2; mp_limb_t *tmp; mp_limb_t b1; mp_prec_t bq, cq; mp_size_t bn, cn, tn, k; - TMP_DECL(marker); + TMP_DECL (marker); /* deal with special cases */ - if (MPFR_ARE_SINGULAR(b,c)) + if (MPFR_ARE_SINGULAR (b, c)) { - if (MPFR_IS_NAN(b) || MPFR_IS_NAN(c)) + if (MPFR_IS_NAN (b) || MPFR_IS_NAN (c)) { - MPFR_SET_NAN(a); + MPFR_SET_NAN (a); MPFR_RET_NAN; } - sign_product = MPFR_MULT_SIGN( MPFR_SIGN(b) , MPFR_SIGN(c) ); - if (MPFR_IS_INF(b)) + sign = MPFR_MULT_SIGN (MPFR_SIGN (b), MPFR_SIGN (c)); + if (MPFR_IS_INF (b)) { - if (MPFR_IS_INF(c) || MPFR_NOTZERO(c)) + if (!MPFR_IS_ZERO (c)) { - MPFR_SET_SIGN(a,sign_product); - MPFR_SET_INF(a); - MPFR_RET(0); /* exact */ + MPFR_SET_SIGN (a, sign); + MPFR_SET_INF (a); + MPFR_RET (0); } else { - MPFR_SET_NAN(a); + MPFR_SET_NAN (a); MPFR_RET_NAN; } } - else if (MPFR_IS_INF(c)) + else if (MPFR_IS_INF (c)) { - if (MPFR_NOTZERO(b)) + if (!MPFR_IS_ZERO (b)) { - MPFR_SET_SIGN(a, sign_product); - MPFR_SET_INF(a); - MPFR_RET(0); /* exact */ + MPFR_SET_SIGN (a, sign); + MPFR_SET_INF (a); + MPFR_RET(0); } else { - MPFR_SET_NAN(a); + MPFR_SET_NAN (a); MPFR_RET_NAN; } } else { - MPFR_ASSERTD(MPFR_IS_ZERO(b) || MPFR_IS_ZERO(c)); - MPFR_SET_SIGN(a, sign_product); - MPFR_SET_ZERO(a); - MPFR_RET(0); /* 0 * 0 is exact */ + MPFR_ASSERTD (MPFR_IS_ZERO(b) || MPFR_IS_ZERO(c)); + MPFR_SET_SIGN (a, sign); + MPFR_SET_ZERO (a); + MPFR_RET (0); } } - MPFR_CLEAR_FLAGS(a); - sign_product = MPFR_MULT_SIGN( MPFR_SIGN(b) , MPFR_SIGN(c) ); + MPFR_CLEAR_FLAGS (a); + sign = MPFR_MULT_SIGN (MPFR_SIGN (b), MPFR_SIGN (c)); ax = MPFR_GET_EXP (b) + MPFR_GET_EXP (c); /* Note: the exponent of the exact result will be e = bx + cx + ec with ec in {-1,0,1} and the following assumes that e is representable. */ - /* These ASSERT should be always true */ - MPFR_ASSERTN(MPFR_EMAX_MAX <= (MPFR_EXP_MAX >> 1)); - MPFR_ASSERTN(MPFR_EMIN_MIN >= -(MPFR_EXP_MAX >> 1)); /* FIXME: Usefull since we do an exponent check after ? * It is usefull iff the precision is big, there is an overflow * and we are doing further mults...*/ #ifdef HUGE - if (MPFR_UNLIKELY(ax > __gmpfr_emax + 1)) - return mpfr_set_overflow (a, rnd_mode, sign_product); - if (MPFR_UNLIKELY(ax < __gmpfr_emin - 2)) + if (MPFR_UNLIKELY (ax > __gmpfr_emax + 1)) + return mpfr_set_overflow (a, rnd_mode, sign); + if (MPFR_UNLIKELY (ax < __gmpfr_emin - 2)) return mpfr_set_underflow (a, rnd_mode == GMP_RNDN ? GMP_RNDZ : rnd_mode, - sign_product); + sign); #endif - bq = MPFR_PREC(b); - cq = MPFR_PREC(c); + bq = MPFR_PREC (b); + cq = MPFR_PREC (c); - MPFR_ASSERTD(bq+cq > bq); /* PREC_MAX is /2 so no integer overflow */ + MPFR_ASSERTD (bq+cq > bq); /* PREC_MAX is /2 so no integer overflow */ bn = (bq+BITS_PER_MP_LIMB-1)/BITS_PER_MP_LIMB; /* number of limbs of b */ cn = (cq+BITS_PER_MP_LIMB-1)/BITS_PER_MP_LIMB; /* number of limbs of c */ k = bn + cn; /* effective nb of limbs used by b*c (= tn or tn+1) below */ tn = (bq + cq + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB; - /* <= k, thus no int overflow */ - MPFR_ASSERTD(tn <= k); + MPFR_ASSERTD (tn <= k); /* tn <= k, thus no int overflow */ /* Check for no size_t overflow*/ - MPFR_ASSERTD((size_t) k <= ((size_t) ~0) / BYTES_PER_MP_LIMB); - TMP_MARK(marker); - tmp = (mp_limb_t *) TMP_ALLOC((size_t) k * BYTES_PER_MP_LIMB); + MPFR_ASSERTD ((size_t) k <= ((size_t) ~0) / BYTES_PER_MP_LIMB); + TMP_MARK (marker); + tmp = (mp_limb_t *) TMP_ALLOC ((size_t) k * BYTES_PER_MP_LIMB); /* multiplies two mantissa in temporary allocated space */ - b1 = (MPFR_LIKELY(bn >= cn)) ? - mpn_mul (tmp, MPFR_MANT(b), bn, MPFR_MANT(c), cn) - : mpn_mul (tmp, MPFR_MANT(c), cn, MPFR_MANT(b), bn); + b1 = MPFR_LIKELY (bn >= cn) + ? mpn_mul (tmp, MPFR_MANT (b), bn, MPFR_MANT (c), cn) + : mpn_mul (tmp, MPFR_MANT (c), cn, MPFR_MANT (b), bn); /* now tmp[0]..tmp[k-1] contains the product of both mantissa, with tmp[k-1]>=2^(BITS_PER_MP_LIMB-2) */ @@ -128,36 +124,28 @@ mpfr_mul (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mp_rnd_t rnd_mode) then their product is in ]1/4, 1/2] with probability 2*ln(2)-1 ~ 0.386 and in [1/2, 1] with probability 2-2*ln(2) ~ 0.614 */ tmp += k - tn; - if (MPFR_UNLIKELY(b1 == 0)) + if (MPFR_UNLIKELY (b1 == 0)) mpn_lshift (tmp, tmp, tn, 1); /* tn <= k, so no stack corruption */ - cc = mpfr_round_raw (MPFR_MANT (a), tmp, bq + cq, - MPFR_IS_NEG_SIGN(sign_product), - MPFR_PREC (a), rnd_mode, &inexact); - - /* cc = 1 ==> result is a power of two */ - if (MPFR_UNLIKELY(cc)) - MPFR_MANT(a)[MPFR_LIMB_SIZE(a)-1] = MPFR_LIMB_HIGHBIT; - - TMP_FREE(marker); - - { - mp_exp_t ax2 = ax + (mp_exp_t) (b1 - 1 + cc); - if (MPFR_UNLIKELY( ax2 > __gmpfr_emax)) - return mpfr_set_overflow (a, rnd_mode, sign_product); - if (MPFR_UNLIKELY( ax2 < __gmpfr_emin)) - { - /* In the rounding to the nearest mode, if the exponent of the exact - result (i.e. before rounding, i.e. without taking cc into account) - is < __gmpfr_emin - 1 or the exact result is a power of 2 (i.e. if - both arguments are powers of 2), then round to zero. */ - if (rnd_mode == GMP_RNDN && - (ax + (mp_exp_t) b1 < __gmpfr_emin || - (mpfr_powerof2_raw (b) && mpfr_powerof2_raw (c)))) - rnd_mode = GMP_RNDZ; - return mpfr_set_underflow (a, rnd_mode, sign_product); - } - MPFR_SET_EXP (a, ax2); - MPFR_SET_SIGN(a, sign_product); - } + + ax2 = ax + (mp_exp_t) (b1 - 1); + MPFR_RNDRAW (inexact, a, tmp, bq+cq, rnd_mode, sign, ax2++); + TMP_FREE (marker); + MPFR_EXP (a) = ax2; /* Can't use MPFR_SET_EXP: Exponent may be out of range */ + MPFR_SET_SIGN (a, sign); + if (MPFR_UNLIKELY (ax2 > __gmpfr_emax)) + return mpfr_set_overflow (a, rnd_mode, sign); + if (MPFR_UNLIKELY (ax2 < __gmpfr_emin)) + { + /* In the rounding to the nearest mode, if the exponent of the exact + result (i.e. before rounding, i.e. without taking cc into account) + is < __gmpfr_emin - 1 or the exact result is a power of 2 (i.e. if + both arguments are powers of 2), then round to zero. */ + if (rnd_mode == GMP_RNDN + && (ax + (mp_exp_t) b1 < __gmpfr_emin + || (mpfr_powerof2_raw (b) && mpfr_powerof2_raw (c)))) + rnd_mode = GMP_RNDZ; + return mpfr_set_underflow (a, rnd_mode, sign); + } return inexact; } + |