Diffstat (limited to 'rts/gmp/mpn/generic')
43 files changed, 8044 insertions, 0 deletions
diff --git a/rts/gmp/mpn/generic/add_n.c b/rts/gmp/mpn/generic/add_n.c new file mode 100644 index 0000000000..5fcb7e4835 --- /dev/null +++ b/rts/gmp/mpn/generic/add_n.c @@ -0,0 +1,62 @@ +/* mpn_add_n -- Add two limb vectors of equal, non-zero length. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +#if __STDC__ +mpn_add_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size) +#else +mpn_add_n (res_ptr, s1_ptr, s2_ptr, size) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + register mp_srcptr s2_ptr; + mp_size_t size; +#endif +{ + register mp_limb_t x, y, cy; + register mp_size_t j; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -size; + + /* Offset the base pointers to compensate for the negative indices. */ + s1_ptr -= j; + s2_ptr -= j; + res_ptr -= j; + + cy = 0; + do + { + y = s2_ptr[j]; + x = s1_ptr[j]; + y += cy; /* add previous carry to one addend */ + cy = (y < cy); /* get out carry from that addition */ + y = x + y; /* add other addend */ + cy = (y < x) + cy; /* get out carry from that add, combine */ + res_ptr[j] = y; + } + while (++j != 0); + + return cy; +} diff --git a/rts/gmp/mpn/generic/addmul_1.c b/rts/gmp/mpn/generic/addmul_1.c new file mode 100644 index 0000000000..746ae31307 --- /dev/null +++ b/rts/gmp/mpn/generic/addmul_1.c @@ -0,0 +1,65 @@ +/* mpn_addmul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR + by S2_LIMB, add the S1_SIZE least significant limbs of the product to the + limb vector pointed to by RES_PTR. Return the most significant limb of + the product, adjusted for carry-out from the addition. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
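+
+   Implementation note: the loop below uses the same style of carry
+   bookkeeping as mpn_add_n.  A sketch of one add_n step, using 4-bit
+   limbs purely for illustration: with x = 12, y = 7 and incoming
+   carry cy = 1, y + cy = 8 does not overflow, then x + y = 20 wraps
+   to 4 and sets the outgoing carry.  At most one of the two
+   additions can overflow in a given step, so the two comparisons
+   together recover the full carry.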
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_addmul_1 (res_ptr, s1_ptr, s1_size, s2_limb) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + mp_size_t s1_size; + register mp_limb_t s2_limb; +{ + register mp_limb_t cy_limb; + register mp_size_t j; + register mp_limb_t prod_high, prod_low; + register mp_limb_t x; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -s1_size; + + /* Offset the base pointers to compensate for the negative indices. */ + res_ptr -= j; + s1_ptr -= j; + + cy_limb = 0; + do + { + umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb); + + prod_low += cy_limb; + cy_limb = (prod_low < cy_limb) + prod_high; + + x = res_ptr[j]; + prod_low = x + prod_low; + cy_limb += (prod_low < x); + res_ptr[j] = prod_low; + } + while (++j != 0); + + return cy_limb; +} diff --git a/rts/gmp/mpn/generic/addsub_n.c b/rts/gmp/mpn/generic/addsub_n.c new file mode 100644 index 0000000000..c9bab3ef60 --- /dev/null +++ b/rts/gmp/mpn/generic/addsub_n.c @@ -0,0 +1,167 @@ +/* mpn_addsub_n -- Add and Subtract two limb vectors of equal, non-zero length. + +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#ifndef L1_CACHE_SIZE +#define L1_CACHE_SIZE 8192 /* only 68040 has less than this */ +#endif + +#define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6) + + +/* mpn_addsub_n. + r1[] = s1[] + s2[] + r2[] = s1[] - s2[] + All operands have n limbs. + In-place operations allowed. */ +mp_limb_t +#if __STDC__ +mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n) +#else +mpn_addsub_n (r1p, r2p, s1p, s2p, n) + mp_ptr r1p, r2p; + mp_srcptr s1p, s2p; + mp_size_t n; +#endif +{ + mp_limb_t acyn, acyo; /* carry for add */ + mp_limb_t scyn, scyo; /* carry for subtract */ + mp_size_t off; /* offset in operands */ + mp_size_t this_n; /* size of current chunk */ + + /* We alternatingly add and subtract in chunks that fit into the (L1) + cache. Since the chunks are several hundred limbs, the function call + overhead is insignificant, but we get much better locality. */ + + /* We have three variant of the inner loop, the proper loop is chosen + depending on whether r1 or r2 are the same operand as s1 or s2. */ + + if (r1p != s1p && r1p != s2p) + { + /* r1 is not identical to either input operand. We can therefore write + to r1 directly, without using temporary storage. 
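+
+     As an illustration (the numbers are hypothetical): with n = 1000
+     and PART_SIZE = 300, the loop below runs over chunks of 300, 300,
+     300 and 100 limbs.  The carry out of each chunk (acyo for the
+     add, scyo for the subtract) is fed into the next chunk, either
+     through the carry-in entry points mpn_add_nc/mpn_sub_nc, or by a
+     separate mpn_add_1/mpn_sub_1 pass over the chunk just written.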
*/ + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n + acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo); +#endif +#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif + } + } + else if (r2p != s1p && r2p != s2p) + { + /* r2 is not identical to either input operand. We can therefore write + to r2 directly, without using temporary storage. */ + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif +#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n + acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo); +#endif + } + } + else + { + /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2=s2 or vice versa) + Need temporary storage. */ + mp_limb_t tp[PART_SIZE]; + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n + acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo); +#endif +#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif + MPN_COPY (r1p + off, tp, this_n); + } + } + + return 2 * acyo + scyo; +} + +#ifdef MAIN +#include <stdlib.h> +#include <stdio.h> +#include "timing.h" + +long cputime (); + +int +main (int argc, char **argv) +{ + mp_ptr r1p, r2p, s1p, s2p; + double t; + mp_size_t n; + + n = strtol (argv[1], 0, 0); + + r1p = malloc (n * BYTES_PER_MP_LIMB); + r2p = malloc (n * BYTES_PER_MP_LIMB); + s1p = malloc (n * BYTES_PER_MP_LIMB); + s2p = malloc (n * BYTES_PER_MP_LIMB); + TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n))); + printf (" separate add and sub: %.3f\n", t); + TIME (t,mpn_addsub_n(r1p,r2p,s1p,s2p,n)); + printf ("combined addsub separate variables: %.3f\n", t); + TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n)); + printf (" combined addsub r1 overlap: %.3f\n", t); + TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n)); + printf (" combined addsub r2 overlap: %.3f\n", t); + TIME (t,mpn_addsub_n(r1p,r2p,r1p,r2p,n)); + printf (" combined addsub in-place: %.3f\n", t); + + return 0; +} +#endif diff --git a/rts/gmp/mpn/generic/bdivmod.c b/rts/gmp/mpn/generic/bdivmod.c new file mode 100644 index 0000000000..c4bcb414e6 --- /dev/null +++ b/rts/gmp/mpn/generic/bdivmod.c @@ -0,0 +1,120 @@ +/* mpn/bdivmod.c: mpn_bdivmod for computing U/V mod 2^d. + +Copyright (C) 1991, 1993, 1994, 1995, 1996, 1999, 2000 Free Software +Foundation, Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* q_high = mpn_bdivmod (qp, up, usize, vp, vsize, d). + + Puts the low d/BITS_PER_MP_LIMB limbs of Q = U / V mod 2^d at qp, and + returns the high d%BITS_PER_MP_LIMB bits of Q as the result. + + Also, U - Q * V mod 2^(usize*BITS_PER_MP_LIMB) is placed at up. Since the + low d/BITS_PER_MP_LIMB limbs of this difference are zero, the code allows + the limb vectors at qp to overwrite the low limbs at up, provided qp <= up. + + Preconditions: + 1. V is odd. + 2. usize * BITS_PER_MP_LIMB >= d. + 3. If Q and U overlap, qp <= up. + + Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu) + + Funding for this work has been partially provided by Conselho Nacional + de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant + 301314194-2, and was done while I was a visiting reseacher in the Instituto + de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS). + + References: + T. Jebelean, An algorithm for exact division, Journal of Symbolic + Computation, v. 15, 1993, pp. 169-180. + + K. Weber, The accelerated integer GCD algorithm, ACM Transactions on + Mathematical Software, v. 21 (March), 1995, pp. 111-122. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +#if __STDC__ +mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize, + mp_srcptr vp, mp_size_t vsize, unsigned long int d) +#else +mpn_bdivmod (qp, up, usize, vp, vsize, d) + mp_ptr qp; + mp_ptr up; + mp_size_t usize; + mp_srcptr vp; + mp_size_t vsize; + unsigned long int d; +#endif +{ + mp_limb_t v_inv; + + /* 1/V mod 2^BITS_PER_MP_LIMB. */ + modlimb_invert (v_inv, vp[0]); + + /* Fast code for two cases previously used by the accel part of mpn_gcd. + (Could probably remove this now it's inlined there.) */ + if (usize == 2 && vsize == 2 && + (d == BITS_PER_MP_LIMB || d == 2*BITS_PER_MP_LIMB)) + { + mp_limb_t hi, lo; + mp_limb_t q = up[0] * v_inv; + umul_ppmm (hi, lo, q, vp[0]); + up[0] = 0, up[1] -= hi + q*vp[1], qp[0] = q; + if (d == 2*BITS_PER_MP_LIMB) + q = up[1] * v_inv, up[1] = 0, qp[1] = q; + return 0; + } + + /* Main loop. 
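+
+     Each step computes q = up[0] * v_inv, which makes
+     up[0] - q * vp[0] divisible by 2^BITS_PER_MP_LIMB, so after the
+     submul the low limb of U is zero and can be stepped past.  A
+     worked instance with 4-bit limbs, purely for illustration: v = 3
+     has inverse 11 (3*11 = 33 == 1 mod 16); for u = 7,
+     q = 7*11 mod 16 = 13, and u - q*v = 7 - 39 = -32 == 0 mod 16,
+     as required.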
*/ + while (d >= BITS_PER_MP_LIMB) + { + mp_limb_t q = up[0] * v_inv; + mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q); + if (usize > vsize) + mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); + d -= BITS_PER_MP_LIMB; + up += 1, usize -= 1; + *qp++ = q; + } + + if (d) + { + mp_limb_t b; + mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1<<d) - 1); + if (q <= 1) + { + if (q == 0) + return 0; + else + b = mpn_sub_n (up, up, vp, MIN (usize, vsize)); + } + else + b = mpn_submul_1 (up, vp, MIN (usize, vsize), q); + + if (usize > vsize) + mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); + return q; + } + + return 0; +} diff --git a/rts/gmp/mpn/generic/bz_divrem_n.c b/rts/gmp/mpn/generic/bz_divrem_n.c new file mode 100644 index 0000000000..d234b22af5 --- /dev/null +++ b/rts/gmp/mpn/generic/bz_divrem_n.c @@ -0,0 +1,153 @@ +/* mpn_bz_divrem_n and auxilliary routines. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A + FUTURE GNU MP RELEASE. + + +Copyright (C) 2000 Free Software Foundation, Inc. +Contributed by Paul Zimmermann. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* +[1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler, + Technical report MPI-I-98-1-022, october 1998. + http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz +*/ + +static mp_limb_t mpn_bz_div_3_halves_by_2 + _PROTO ((mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)); + + +/* mpn_bz_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than + div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way, + i.e. n/2 >= KARATSUBA_MUL_THRESHOLD */ +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD) +#endif + +#if 0 +static +unused_mpn_divrem (qp, qxn, np, nn, dp, dn) + mp_ptr qp; + mp_size_t qxn; + mp_ptr np; + mp_size_t nn; + mp_srcptr dp; + mp_size_t dn; +{ + /* This might be useful: */ + if (qxn != 0) + { + mp_limb_t c; + mp_ptr tp = alloca ((nn + qxn) * BYTES_PER_MP_LIMB); + MPN_COPY (tp + qxn - nn, np, nn); + MPN_ZERO (tp, qxn); + c = mpn_divrem (qp, 0L, tp, nn + qxn, dp, dn); + /* Maybe copy proper part of tp to np? Documentation is unclear about + the returned np value when qxn != 0 */ + return c; + } +} +#endif + + +/* mpn_bz_divrem_n - Implements algorithm of page 8 in [1]: divides (np,2n) + by (dp,n) and puts the quotient in (qp,n), the remainder in (np,n). + Returns most significant limb of the quotient, which is 0 or 1. + Requires that the most significant bit of the divisor is set. 
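+
+   For even n the algorithm splits the division as in [1]: with n = 4,
+   the high two quotient limbs come from dividing the top 3n/2 = 6
+   limbs of N by D (a "3 halves by 2" step), and the low two from the
+   same step applied to the updated low part; each such step does one
+   recursive divide on its top half plus one n/2 x n/2 multiply to
+   update the remainder.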
*/ + +mp_limb_t +#if __STDC__ +mpn_bz_divrem_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n) +#else +mpn_bz_divrem_n (qp, np, dp, n) + mp_ptr qp; + mp_ptr np; + mp_srcptr dp; + mp_size_t n; +#endif +{ + mp_limb_t qhl, cc; + + if (n % 2 != 0) + { + qhl = mpn_bz_divrem_n (qp + 1, np + 2, dp + 1, n - 1); + cc = mpn_submul_1 (np + 1, qp + 1, n - 1, dp[0]); + cc = mpn_sub_1 (np + n, np + n, 1, cc); + if (qhl) cc += mpn_sub_1 (np + n, np + n, 1, dp[0]); + while (cc) + { + qhl -= mpn_sub_1 (qp + 1, qp + 1, n - 1, (mp_limb_t) 1); + cc -= mpn_add_n (np + 1, np + 1, dp, n); + } + qhl += mpn_add_1 (qp + 1, qp + 1, n - 1, + mpn_sb_divrem_mn (qp, np, n + 1, dp, n)); + } + else + { + mp_size_t n2 = n/2; + qhl = mpn_bz_div_3_halves_by_2 (qp + n2, np + n2, dp, n2); + qhl += mpn_add_1 (qp + n2, qp + n2, n2, + mpn_bz_div_3_halves_by_2 (qp, np, dp, n2)); + } + return qhl; +} + + +/* divides (np, 3n) by (dp, 2n) and puts the quotient in (qp, n), + the remainder in (np, 2n) */ + +static mp_limb_t +#if __STDC__ +mpn_bz_div_3_halves_by_2 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n) +#else +mpn_bz_div_3_halves_by_2 (qp, np, dp, n) + mp_ptr qp; + mp_ptr np; + mp_srcptr dp; + mp_size_t n; +#endif +{ + mp_size_t twon = n + n; + mp_limb_t qhl, cc; + mp_ptr tmp; + TMP_DECL (marker); + + TMP_MARK (marker); + if (n < BZ_THRESHOLD) + qhl = mpn_sb_divrem_mn (qp, np + n, twon, dp + n, n); + else + qhl = mpn_bz_divrem_n (qp, np + n, dp + n, n); + tmp = (mp_ptr) TMP_ALLOC (twon * BYTES_PER_MP_LIMB); + mpn_mul_n (tmp, qp, dp, n); + cc = mpn_sub_n (np, np, tmp, twon); + TMP_FREE (marker); + if (qhl) cc += mpn_sub_n (np + n, np + n, dp, n); + while (cc) + { + qhl -= mpn_sub_1 (qp, qp, n, (mp_limb_t) 1); + cc -= mpn_add_n (np, np, dp, twon); + } + return qhl; +} diff --git a/rts/gmp/mpn/generic/cmp.c b/rts/gmp/mpn/generic/cmp.c new file mode 100644 index 0000000000..8e9792f54e --- /dev/null +++ b/rts/gmp/mpn/generic/cmp.c @@ -0,0 +1,56 @@ +/* mpn_cmp -- Compare two low-level natural-number integers. + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Compare OP1_PTR/OP1_SIZE with OP2_PTR/OP2_SIZE. + There are no restrictions on the relative sizes of + the two arguments. + Return 1 if OP1 > OP2, 0 if they are equal, and -1 if OP1 < OP2. 
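+
+   For example, with size == 2, comparing {5, 9} against {7, 8} (least
+   significant limb first): the loop starts at the most significant
+   limbs, finds 9 != 8, and returns 1 without examining the low limbs.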
*/ + +int +#if __STDC__ +mpn_cmp (mp_srcptr op1_ptr, mp_srcptr op2_ptr, mp_size_t size) +#else +mpn_cmp (op1_ptr, op2_ptr, size) + mp_srcptr op1_ptr; + mp_srcptr op2_ptr; + mp_size_t size; +#endif +{ + mp_size_t i; + mp_limb_t op1_word, op2_word; + + for (i = size - 1; i >= 0; i--) + { + op1_word = op1_ptr[i]; + op2_word = op2_ptr[i]; + if (op1_word != op2_word) + goto diff; + } + return 0; + diff: + /* This can *not* be simplified to + op2_word - op2_word + since that expression might give signed overflow. */ + return (op1_word > op2_word) ? 1 : -1; +} diff --git a/rts/gmp/mpn/generic/diveby3.c b/rts/gmp/mpn/generic/diveby3.c new file mode 100644 index 0000000000..a2fb552bfa --- /dev/null +++ b/rts/gmp/mpn/generic/diveby3.c @@ -0,0 +1,77 @@ +/* mpn_divexact_by3 -- mpn division by 3, expecting no remainder. */ + +/* +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + + +#include "gmp.h" +#include "gmp-impl.h" + + +/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB. + 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */ +#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1) + + +/* The "c += ..."s are adding the high limb of 3*l to c. That high limb + will be 0, 1 or 2. Doing two separate "+="s seems to turn out better + code on gcc (as of 2.95.2 at least). + + When a subtraction of a 0,1,2 carry value causes a borrow, that leaves a + limb value of either 0xFF...FF or 0xFF...FE and the multiply by INVERSE_3 + gives 0x55...55 or 0xAA...AA respectively, producing a further borrow of + only 0 or 1 respectively. Hence the carry out of each stage and for the + return value is always only 0, 1 or 2. */ + +mp_limb_t +#if __STDC__ +mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t c) +#else +mpn_divexact_by3c (dst, src, size, c) + mp_ptr dst; + mp_srcptr src; + mp_size_t size; + mp_limb_t c; +#endif +{ + mp_size_t i; + + ASSERT (size >= 1); + + i = 0; + do + { + mp_limb_t l, s; + + s = src[i]; + l = s - c; + c = (l > s); + + l *= INVERSE_3; + dst[i] = l; + + c += (l > MP_LIMB_T_MAX/3); + c += (l > (MP_LIMB_T_MAX/3)*2); + } + while (++i < size); + + return c; +} diff --git a/rts/gmp/mpn/generic/divrem.c b/rts/gmp/mpn/generic/divrem.c new file mode 100644 index 0000000000..30673e76d9 --- /dev/null +++ b/rts/gmp/mpn/generic/divrem.c @@ -0,0 +1,101 @@ +/* mpn_divrem -- Divide natural numbers, producing both remainder and + quotient. This is now just a middle layer for calling the new + internal mpn_tdiv_qr. + +Copyright (C) 1993, 1994, 1995, 1996, 1997, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +#if __STDC__ +mpn_divrem (mp_ptr qp, mp_size_t qxn, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn) +#else +mpn_divrem (qp, qxn, np, nn, dp, dn) + mp_ptr qp; + mp_size_t qxn; + mp_ptr np; + mp_size_t nn; + mp_srcptr dp; + mp_size_t dn; +#endif +{ + if (dn == 1) + { + mp_limb_t ret; + mp_ptr q2p; + mp_size_t qn; + TMP_DECL (marker); + + TMP_MARK (marker); + q2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB); + + np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]); + qn = nn + qxn - 1; + MPN_COPY (qp, q2p, qn); + ret = q2p[qn]; + + TMP_FREE (marker); + return ret; + } + else if (dn == 2) + { + return mpn_divrem_2 (qp, qxn, np, nn, dp); + } + else + { + mp_ptr rp, q2p; + mp_limb_t qhl; + mp_size_t qn; + TMP_DECL (marker); + + TMP_MARK (marker); + if (qxn != 0) + { + mp_ptr n2p; + n2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB); + MPN_ZERO (n2p, qxn); + MPN_COPY (n2p + qxn, np, nn); + q2p = (mp_ptr) TMP_ALLOC ((nn - dn + qxn + 1) * BYTES_PER_MP_LIMB); + rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_tdiv_qr (q2p, rp, 0L, n2p, nn + qxn, dp, dn); + MPN_COPY (np, rp, dn); + qn = nn - dn + qxn; + MPN_COPY (qp, q2p, qn); + qhl = q2p[qn]; + } + else + { + q2p = (mp_ptr) TMP_ALLOC ((nn - dn + 1) * BYTES_PER_MP_LIMB); + rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_tdiv_qr (q2p, rp, 0L, np, nn, dp, dn); + MPN_COPY (np, rp, dn); /* overwrite np area with remainder */ + qn = nn - dn; + MPN_COPY (qp, q2p, qn); + qhl = q2p[qn]; + } + TMP_FREE (marker); + return qhl; + } +} diff --git a/rts/gmp/mpn/generic/divrem_1.c b/rts/gmp/mpn/generic/divrem_1.c new file mode 100644 index 0000000000..e93f241c9d --- /dev/null +++ b/rts/gmp/mpn/generic/divrem_1.c @@ -0,0 +1,248 @@ +/* mpn_divrem_1(quot_ptr, qsize, dividend_ptr, dividend_size, divisor_limb) -- + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + + QUOT_PTR and DIVIDEND_PTR might point to the same limb. + +Copyright (C) 1991, 1993, 1994, 1996, 1998, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+ +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + + +/* __gmpn_divmod_1_internal(quot_ptr,dividend_ptr,dividend_size,divisor_limb) + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + + QUOT_PTR and DIVIDEND_PTR might point to the same limb. */ + +#ifndef UMUL_TIME +#define UMUL_TIME 1 +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME UMUL_TIME +#endif + +static mp_limb_t +#if __STDC__ +__gmpn_divmod_1_internal (mp_ptr quot_ptr, + mp_srcptr dividend_ptr, mp_size_t dividend_size, + mp_limb_t divisor_limb) +#else +__gmpn_divmod_1_internal (quot_ptr, dividend_ptr, dividend_size, divisor_limb) + mp_ptr quot_ptr; + mp_srcptr dividend_ptr; + mp_size_t dividend_size; + mp_limb_t divisor_limb; +#endif +{ + mp_size_t i; + mp_limb_t n1, n0, r; + int dummy; + + /* ??? Should this be handled at all? Rely on callers? */ + if (dividend_size == 0) + return 0; + + /* If multiplication is much faster than division, and the + dividend is large, pre-invert the divisor, and use + only multiplications in the inner loop. */ + + /* This test should be read: + Does it ever help to use udiv_qrnnd_preinv? + && Does what we save compensate for the inversion overhead? */ + if (UDIV_TIME > (2 * UMUL_TIME + 6) + && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + mp_limb_t divisor_limb_inverted; + + divisor_limb <<= normalization_steps; + invert_limb (divisor_limb_inverted, divisor_limb); + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... */ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (quot_ptr[i + 1], r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb, divisor_limb_inverted); + n1 = n0; + } + udiv_qrnnd_preinv (quot_ptr[0], r, r, + n1 << normalization_steps, + divisor_limb, divisor_limb_inverted); + return r >> normalization_steps; + } + else + { + mp_limb_t divisor_limb_inverted; + + invert_limb (divisor_limb_inverted, divisor_limb); + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + { + quot_ptr[i] = 0; + i--; + } + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (quot_ptr[i], r, r, + n0, divisor_limb, divisor_limb_inverted); + } + return r; + } + } + else + { + if (UDIV_NEEDS_NORMALIZATION) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + divisor_limb <<= normalization_steps; + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... 
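+
+	     As a concrete illustration of the normalization itself,
+	     with hypothetical 8-bit limbs: divisor_limb = 0x0B has 4
+	     leading zeros, so it becomes 0xB0; each udiv_qrnnd then
+	     sees dividend limbs shifted left by 4, with the bits
+	     shifted out carried into the next limb, and the final
+	     remainder is shifted back right by 4.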
*/ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (quot_ptr[i + 1], r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb); + n1 = n0; + } + udiv_qrnnd (quot_ptr[0], r, r, + n1 << normalization_steps, + divisor_limb); + return r >> normalization_steps; + } + } + /* No normalization needed, either because udiv_qrnnd doesn't require + it, or because DIVISOR_LIMB is already normalized. */ + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + { + quot_ptr[i] = 0; + i--; + } + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (quot_ptr[i], r, r, n0, divisor_limb); + } + return r; + } +} + + + +mp_limb_t +#if __STDC__ +mpn_divrem_1 (mp_ptr qp, mp_size_t qxn, + mp_srcptr np, mp_size_t nn, + mp_limb_t d) +#else +mpn_divrem_1 (qp, qxn, np, nn, d) + mp_ptr qp; + mp_size_t qxn; + mp_srcptr np; + mp_size_t nn; + mp_limb_t d; +#endif +{ + mp_limb_t rlimb; + mp_size_t i; + + /* Develop integer part of quotient. */ + rlimb = __gmpn_divmod_1_internal (qp + qxn, np, nn, d); + + /* Develop fraction part of quotient. This is not as fast as it should; + the preinvert stuff from __gmpn_divmod_1_internal ought to be used here + too. */ + if (UDIV_NEEDS_NORMALIZATION) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, d); + if (normalization_steps != 0) + { + d <<= normalization_steps; + rlimb <<= normalization_steps; + + for (i = qxn - 1; i >= 0; i--) + udiv_qrnnd (qp[i], rlimb, rlimb, 0, d); + + return rlimb >> normalization_steps; + } + else + /* fall out */ + ; + } + + for (i = qxn - 1; i >= 0; i--) + udiv_qrnnd (qp[i], rlimb, rlimb, 0, d); + + return rlimb; +} diff --git a/rts/gmp/mpn/generic/divrem_2.c b/rts/gmp/mpn/generic/divrem_2.c new file mode 100644 index 0000000000..0bc31ae2e7 --- /dev/null +++ b/rts/gmp/mpn/generic/divrem_2.c @@ -0,0 +1,151 @@ +/* mpn_divrem_2 -- Divide natural numbers, producing both remainder and + quotient. The divisor is two limbs. + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS + ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP + RELEASE. + + +Copyright (C) 1993, 1994, 1995, 1996, 1999, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Divide num (NP/NSIZE) by den (DP/2) and write + the NSIZE-2 least significant quotient limbs at QP + and the 2 long remainder at NP. If QEXTRA_LIMBS is + non-zero, generate that many fraction bits and append them after the + other quotient limbs. 
+ Return the most significant limb of the quotient, this is always 0 or 1. + + Preconditions: + 0. NSIZE >= 2. + 1. The most significant bit of the divisor must be set. + 2. QP must either not overlap with the input operands at all, or + QP + 2 >= NP must hold true. (This means that it's + possible to put the quotient in the high part of NUM, right after the + remainder in NUM. + 3. NSIZE >= 2, even if QEXTRA_LIMBS is non-zero. */ + +mp_limb_t +#if __STDC__ +mpn_divrem_2 (mp_ptr qp, mp_size_t qxn, + mp_ptr np, mp_size_t nsize, + mp_srcptr dp) +#else +mpn_divrem_2 (qp, qxn, np, nsize, dp) + mp_ptr qp; + mp_size_t qxn; + mp_ptr np; + mp_size_t nsize; + mp_srcptr dp; +#endif +{ + mp_limb_t most_significant_q_limb = 0; + mp_size_t i; + mp_limb_t n1, n0, n2; + mp_limb_t d1, d0; + mp_limb_t d1inv; + int have_preinv; + + np += nsize - 2; + d1 = dp[1]; + d0 = dp[0]; + n1 = np[1]; + n0 = np[0]; + + if (n1 >= d1 && (n1 > d1 || n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + most_significant_q_limb = 1; + } + + /* If multiplication is much faster than division, preinvert the most + significant divisor limb before entering the loop. */ + if (UDIV_TIME > 2 * UMUL_TIME + 6) + { + have_preinv = 0; + if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - 2) > UDIV_TIME) + { + invert_limb (d1inv, d1); + have_preinv = 1; + } + } + + for (i = qxn + nsize - 2 - 1; i >= 0; i--) + { + mp_limb_t q; + mp_limb_t r; + + if (i >= qxn) + np--; + else + np[0] = 0; + + if (n1 == d1) + { + /* Q should be either 111..111 or 111..110. Need special treatment + of this rare case as normal division would give overflow. */ + q = ~(mp_limb_t) 0; + + r = n0 + d1; + if (r < d1) /* Carry in the addition? */ + { + add_ssaaaa (n1, n0, r - d0, np[0], 0, d0); + qp[i] = q; + continue; + } + n1 = d0 - (d0 != 0); + n0 = -d0; + } + else + { + if (UDIV_TIME > 2 * UMUL_TIME + 6 && have_preinv) + udiv_qrnnd_preinv (q, r, n1, n0, d1, d1inv); + else + udiv_qrnnd (q, r, n1, n0, d1); + umul_ppmm (n1, n0, d0, q); + } + + n2 = np[0]; + + q_test: + if (n1 > r || (n1 == r && n0 > n2)) + { + /* The estimated Q was too large. */ + q--; + + sub_ddmmss (n1, n0, n1, n0, 0, d0); + r += d1; + if (r >= d1) /* If not carry, test Q again. */ + goto q_test; + } + + qp[i] = q; + sub_ddmmss (n1, n0, r, n2, n1, n0); + } + np[1] = n1; + np[0] = n0; + + return most_significant_q_limb; +} diff --git a/rts/gmp/mpn/generic/dump.c b/rts/gmp/mpn/generic/dump.c new file mode 100644 index 0000000000..66f375c74b --- /dev/null +++ b/rts/gmp/mpn/generic/dump.c @@ -0,0 +1,76 @@ +/* THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO + CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS + FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include <stdio.h> +#include "gmp.h" +#include "gmp-impl.h" + +void +#if __STDC__ +mpn_dump (mp_srcptr ptr, mp_size_t size) +#else +mpn_dump (ptr, size) + mp_srcptr ptr; + mp_size_t size; +#endif +{ + MPN_NORMALIZE (ptr, size); + + if (size == 0) + printf ("0\n"); + else + { + size--; + if (BYTES_PER_MP_LIMB > sizeof (long)) + { + if ((ptr[size] >> BITS_PER_MP_LIMB/2) != 0) + { + printf ("%lX", + (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2)); + printf ("%0*lX", (int) (BYTES_PER_MP_LIMB), + (unsigned long) ptr[size]); + } + else + printf ("%lX", (unsigned long) ptr[size]); + } + else + printf ("%lX", ptr[size]); + + while (size) + { + size--; + if (BYTES_PER_MP_LIMB > sizeof (long)) + { + printf ("%0*lX", (int) (BYTES_PER_MP_LIMB), + (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2)); + printf ("%0*lX", (int) (BYTES_PER_MP_LIMB), + (unsigned long) ptr[size]); + } + else + printf ("%0*lX", (int) (2 * BYTES_PER_MP_LIMB), ptr[size]); + } + printf ("\n"); + } +} diff --git a/rts/gmp/mpn/generic/gcd.c b/rts/gmp/mpn/generic/gcd.c new file mode 100644 index 0000000000..059e219a06 --- /dev/null +++ b/rts/gmp/mpn/generic/gcd.c @@ -0,0 +1,414 @@ +/* mpn/gcd.c: mpn_gcd for gcd of two odd integers. + +Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* Integer greatest common divisor of two unsigned integers, using + the accelerated algorithm (see reference below). + + mp_size_t mpn_gcd (up, usize, vp, vsize). + + Preconditions [U = (up, usize) and V = (vp, vsize)]: + + 1. V is odd. + 2. numbits(U) >= numbits(V). + + Both U and V are destroyed by the operation. The result is left at vp, + and its size is returned. + + Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu) + + Funding for this work has been partially provided by Conselho Nacional + de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant + 301314194-2, and was done while I was a visiting reseacher in the Instituto + de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS). + + Refer to + K. Weber, The accelerated integer GCD algorithm, ACM Transactions on + Mathematical Software, v. 21 (March), 1995, pp. 111-122. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated + algorithm is used, otherwise the binary algorithm is used. This may be + adjusted for different architectures. */ +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 5 +#endif + +/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated + algorithm reduces using the bmod operation. 
Otherwise, the k-ary reduction + is used. 0 <= BMOD_THRESHOLD < BITS_PER_MP_LIMB. */ +enum + { + BMOD_THRESHOLD = BITS_PER_MP_LIMB/2 + }; + + +/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2. + Both U and V must be odd. */ +static __gmp_inline mp_size_t +#if __STDC__ +gcd_2 (mp_ptr vp, mp_srcptr up) +#else +gcd_2 (vp, up) + mp_ptr vp; + mp_srcptr up; +#endif +{ + mp_limb_t u0, u1, v0, v1; + mp_size_t vsize; + + u0 = up[0], u1 = up[1], v0 = vp[0], v1 = vp[1]; + + while (u1 != v1 && u0 != v0) + { + unsigned long int r; + if (u1 > v1) + { + u1 -= v1 + (u0 < v0), u0 -= v0; + count_trailing_zeros (r, u0); + u0 = u1 << (BITS_PER_MP_LIMB - r) | u0 >> r; + u1 >>= r; + } + else /* u1 < v1. */ + { + v1 -= u1 + (v0 < u0), v0 -= u0; + count_trailing_zeros (r, v0); + v0 = v1 << (BITS_PER_MP_LIMB - r) | v0 >> r; + v1 >>= r; + } + } + + vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0); + + /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */ + if (u1 == v1 && u0 == v0) + return vsize; + + v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0; + vp[0] = mpn_gcd_1 (vp, vsize, v0); + + return 1; +} + +/* The function find_a finds 0 < N < 2^BITS_PER_MP_LIMB such that there exists + 0 < |D| < 2^BITS_PER_MP_LIMB, and N == D * C mod 2^(2*BITS_PER_MP_LIMB). + In the reference article, D was computed along with N, but it is better to + compute D separately as D <-- N / C mod 2^(BITS_PER_MP_LIMB + 1), treating + the result as a twos' complement signed integer. + + Initialize N1 to C mod 2^(2*BITS_PER_MP_LIMB). According to the reference + article, N2 should be initialized to 2^(2*BITS_PER_MP_LIMB), but we use + 2^(2*BITS_PER_MP_LIMB) - N1 to start the calculations within double + precision. If N2 > N1 initially, the first iteration of the while loop + will swap them. In all other situations, N1 >= N2 is maintained. */ + +static +#if ! defined (__i386__) +__gmp_inline /* don't inline this for the x86 */ +#endif +mp_limb_t +#if __STDC__ +find_a (mp_srcptr cp) +#else +find_a (cp) + mp_srcptr cp; +#endif +{ + unsigned long int leading_zero_bits = 0; + + mp_limb_t n1_l = cp[0]; /* N1 == n1_h * 2^BITS_PER_MP_LIMB + n1_l. */ + mp_limb_t n1_h = cp[1]; + + mp_limb_t n2_l = -n1_l; /* N2 == n2_h * 2^BITS_PER_MP_LIMB + n2_l. */ + mp_limb_t n2_h = ~n1_h; + + /* Main loop. */ + while (n2_h) /* While N2 >= 2^BITS_PER_MP_LIMB. */ + { + /* N1 <-- N1 % N2. */ + if ((MP_LIMB_T_HIGHBIT >> leading_zero_bits & n2_h) == 0) + { + unsigned long int i; + count_leading_zeros (i, n2_h); + i -= leading_zero_bits, leading_zero_bits += i; + n2_h = n2_h<<i | n2_l>>(BITS_PER_MP_LIMB - i), n2_l <<= i; + do + { + if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) + n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l; + n2_l = n2_l>>1 | n2_h<<(BITS_PER_MP_LIMB - 1), n2_h >>= 1; + i -= 1; + } + while (i); + } + if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) + n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l; + + MP_LIMB_T_SWAP (n1_h, n2_h); + MP_LIMB_T_SWAP (n1_l, n2_l); + } + + return n2_l; +} + +mp_size_t +#if __STDC__ +mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) +#else +mpn_gcd (gp, up, usize, vp, vsize) + mp_ptr gp; + mp_ptr up; + mp_size_t usize; + mp_ptr vp; + mp_size_t vsize; +#endif +{ + mp_ptr orig_vp = vp; + mp_size_t orig_vsize = vsize; + int binary_gcd_ctr; /* Number of times binary gcd will execute. */ + TMP_DECL (marker); + + TMP_MARK (marker); + + /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD. 
+ Two EXTRA limbs for U and V are required for kary reduction. */ + if (vsize >= GCD_ACCEL_THRESHOLD) + { + unsigned long int vbitsize, d; + mp_ptr orig_up = up; + mp_size_t orig_usize = usize; + mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB); + + MPN_COPY (anchor_up, orig_up, usize); + up = anchor_up; + + count_leading_zeros (d, up[usize-1]); + d = usize * BITS_PER_MP_LIMB - d; + count_leading_zeros (vbitsize, vp[vsize-1]); + vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize; + d = d - vbitsize + 1; + + /* Use bmod reduction to quickly discover whether V divides U. */ + up[usize++] = 0; /* Insert leading zero. */ + mpn_bdivmod (up, up, usize, vp, vsize, d); + + /* Now skip U/V mod 2^d and any low zero limbs. */ + d /= BITS_PER_MP_LIMB, up += d, usize -= d; + while (usize != 0 && up[0] == 0) + up++, usize--; + + if (usize == 0) /* GCD == ORIG_V. */ + goto done; + + vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB); + MPN_COPY (vp, orig_vp, vsize); + + do /* Main loop. */ + { + /* mpn_com_n can't be used here because anchor_up and up may + partially overlap */ + if (up[usize-1] & MP_LIMB_T_HIGHBIT) /* U < 0; take twos' compl. */ + { + mp_size_t i; + anchor_up[0] = -up[0]; + for (i = 1; i < usize; i++) + anchor_up[i] = ~up[i]; + up = anchor_up; + } + + MPN_NORMALIZE_NOT_ZERO (up, usize); + + if ((up[0] & 1) == 0) /* Result even; remove twos. */ + { + unsigned int r; + count_trailing_zeros (r, up[0]); + mpn_rshift (anchor_up, up, usize, r); + usize -= (anchor_up[usize-1] == 0); + } + else if (anchor_up != up) + MPN_COPY_INCR (anchor_up, up, usize); + + MPN_PTR_SWAP (anchor_up,usize, vp,vsize); + up = anchor_up; + + if (vsize <= 2) /* Kary can't handle < 2 limbs and */ + break; /* isn't efficient for == 2 limbs. */ + + d = vbitsize; + count_leading_zeros (vbitsize, vp[vsize-1]); + vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize; + d = d - vbitsize + 1; + + if (d > BMOD_THRESHOLD) /* Bmod reduction. */ + { + up[usize++] = 0; + mpn_bdivmod (up, up, usize, vp, vsize, d); + d /= BITS_PER_MP_LIMB, up += d, usize -= d; + } + else /* Kary reduction. */ + { + mp_limb_t bp[2], cp[2]; + + /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB). */ + { + mp_limb_t u_inv, hi, lo; + modlimb_invert (u_inv, up[0]); + cp[0] = vp[0] * u_inv; + umul_ppmm (hi, lo, cp[0], up[0]); + cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv; + } + + /* U <-- find_a (C) * U. */ + up[usize] = mpn_mul_1 (up, up, usize, find_a (cp)); + usize++; + + /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1). + bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and + bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2 + + Like V/U above, but simplified because only the low bit of + bp[1] is wanted. */ + { + mp_limb_t v_inv, hi, lo; + modlimb_invert (v_inv, vp[0]); + bp[0] = up[0] * v_inv; + umul_ppmm (hi, lo, bp[0], vp[0]); + bp[1] = (up[1] + hi + (bp[0]&vp[1])) & 1; + } + + up[usize++] = 0; + if (bp[1]) /* B < 0: U <-- U + (-B) * V. */ + { + mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0]); + mpn_add_1 (up + vsize, up + vsize, usize - vsize, c); + } + else /* B >= 0: U <-- U - B * V. */ + { + mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]); + mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); + } + + up += 2, usize -= 2; /* At least two low limbs are zero. */ + } + + /* Must remove low zero limbs before complementing. */ + while (usize != 0 && up[0] == 0) + up++, usize--; + } + while (usize); + + /* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. 
*/ + up = orig_up, usize = orig_usize; + binary_gcd_ctr = 2; + } + else + binary_gcd_ctr = 1; + + /* Finish up with the binary algorithm. Executes once or twice. */ + for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize) + { + if (usize > 2) /* First make U close to V in size. */ + { + unsigned long int vbitsize, d; + count_leading_zeros (d, up[usize-1]); + d = usize * BITS_PER_MP_LIMB - d; + count_leading_zeros (vbitsize, vp[vsize-1]); + vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize; + d = d - vbitsize - 1; + if (d != -(unsigned long int)1 && d > 2) + { + mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */ + d /= (unsigned long int)BITS_PER_MP_LIMB, up += d, usize -= d; + } + } + + /* Start binary GCD. */ + do + { + mp_size_t zeros; + + /* Make sure U is odd. */ + MPN_NORMALIZE (up, usize); + while (up[0] == 0) + up += 1, usize -= 1; + if ((up[0] & 1) == 0) + { + unsigned int r; + count_trailing_zeros (r, up[0]); + mpn_rshift (up, up, usize, r); + usize -= (up[usize-1] == 0); + } + + /* Keep usize >= vsize. */ + if (usize < vsize) + MPN_PTR_SWAP (up, usize, vp, vsize); + + if (usize <= 2) /* Double precision. */ + { + if (vsize == 1) + vp[0] = mpn_gcd_1 (up, usize, vp[0]); + else + vsize = gcd_2 (vp, up); + break; /* Binary GCD done. */ + } + + /* Count number of low zero limbs of U - V. */ + for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; ) + continue; + + /* If U < V, swap U and V; in any case, subtract V from U. */ + if (zeros == vsize) /* Subtract done. */ + up += zeros, usize -= zeros; + else if (usize == vsize) + { + mp_size_t size = vsize; + do + size--; + while (up[size] == vp[size]); + if (up[size] < vp[size]) /* usize == vsize. */ + MP_PTR_SWAP (up, vp); + up += zeros, usize = size + 1 - zeros; + mpn_sub_n (up, up, vp + zeros, usize); + } + else + { + mp_size_t size = vsize - zeros; + up += zeros, usize -= zeros; + if (mpn_sub_n (up, up, vp + zeros, size)) + { + while (up[size] == 0) /* Propagate borrow. */ + up[size++] = -(mp_limb_t)1; + up[size] -= 1; + } + } + } + while (usize); /* End binary GCD. */ + } + +done: + if (vp != gp) + MPN_COPY (gp, vp, vsize); + TMP_FREE (marker); + return vsize; +} diff --git a/rts/gmp/mpn/generic/gcd_1.c b/rts/gmp/mpn/generic/gcd_1.c new file mode 100644 index 0000000000..1832636636 --- /dev/null +++ b/rts/gmp/mpn/generic/gcd_1.c @@ -0,0 +1,77 @@ +/* mpn_gcd_1 -- + +Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Does not work for U == 0 or V == 0. It would be tough to make it work for + V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t. 
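+
+   An example of the reduction below: for ulimb = 12 and vlimb = 18,
+   stripping trailing zeros gives u = 3 (2 zeros) and v = 9 (1 zero);
+   then v - u = 6 halves to 3, u == v == 3, and the result is 3
+   shifted back by MIN (2, 1), i.e. gcd = 6.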
*/ + +mp_limb_t +#if __STDC__ +mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb) +#else +mpn_gcd_1 (up, size, vlimb) + mp_srcptr up; + mp_size_t size; + mp_limb_t vlimb; +#endif +{ + mp_limb_t ulimb; + unsigned long int u_low_zero_bits, v_low_zero_bits; + + if (size > 1) + { + ulimb = mpn_mod_1 (up, size, vlimb); + if (ulimb == 0) + return vlimb; + } + else + ulimb = up[0]; + + /* Need to eliminate low zero bits. */ + count_trailing_zeros (u_low_zero_bits, ulimb); + ulimb >>= u_low_zero_bits; + + count_trailing_zeros (v_low_zero_bits, vlimb); + vlimb >>= v_low_zero_bits; + + while (ulimb != vlimb) + { + if (ulimb > vlimb) + { + ulimb -= vlimb; + do + ulimb >>= 1; + while ((ulimb & 1) == 0); + } + else /* vlimb > ulimb. */ + { + vlimb -= ulimb; + do + vlimb >>= 1; + while ((vlimb & 1) == 0); + } + } + + return ulimb << MIN (u_low_zero_bits, v_low_zero_bits); +} diff --git a/rts/gmp/mpn/generic/gcdext.c b/rts/gmp/mpn/generic/gcdext.c new file mode 100644 index 0000000000..fe22d779a6 --- /dev/null +++ b/rts/gmp/mpn/generic/gcdext.c @@ -0,0 +1,700 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright (C) 1996, 1998, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 17 +#endif + +#ifndef EXTEND +#define EXTEND 1 +#endif + +#if STAT +int arr[BITS_PER_MP_LIMB]; +#endif + + +/* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE) + + Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the + greatest common divisor at GP (unless it is 0), and the first cofactor at + SP. Write the size of the cofactor through the pointer SSIZE. Return the + size of the value at GP. Note that SP might be a negative number; this is + denoted by storing the negative of the size through SSIZE. + + {UP,USIZE} and {VP,VSIZE} are both clobbered. + + The space allocation for all four areas needs to be USIZE+1. + + Preconditions: 1) U >= V. + 2) V > 0. */ + +/* We use Lehmer's algorithm. The idea is to extract the most significant + bits of the operands, and compute the continued fraction for them. We then + apply the gathered cofactors to the full operands. + + Idea 1: After we have performed a full division, don't shift operands back, + but instead account for the extra factors-of-2 thus introduced. + Idea 2: Simple generalization to use divide-and-conquer would give us an + algorithm that runs faster than O(n^2). + Idea 3: The input numbers need less space as the computation progresses, + while the s0 and s1 variables need more space. To save memory, we + could make them share space, and have the latter variables grow + into the former. 
+ Idea 4: We should not do double-limb arithmetic from the start. Instead, + do things in single-limb arithmetic until the quotients differ, + and then switch to double-limb arithmetic. */ + + +/* Division optimized for small quotients. If the quotient is more than one limb, + store 1 in *qh and return 0. */ +static mp_limb_t +#if __STDC__ +div2 (mp_limb_t *qh, mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0) +#else +div2 (qh, n1, n0, d1, d0) + mp_limb_t *qh; + mp_limb_t n1; + mp_limb_t n0; + mp_limb_t d1; + mp_limb_t d0; +#endif +{ + if (d1 == 0) + { + *qh = 1; + return 0; + } + + if ((mp_limb_signed_t) n1 < 0) + { + mp_limb_t q; + int cnt; + for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++) + { + d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1)); + d0 = d0 << 1; + } + + q = 0; + while (cnt) + { + q <<= 1; + if (n1 > d1 || (n1 == d1 && n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + q |= 1; + } + d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1); + d1 = d1 >> 1; + cnt--; + } + + *qh = 0; + return q; + } + else + { + mp_limb_t q; + int cnt; + for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++) + { + d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1)); + d0 = d0 << 1; + } + + q = 0; + while (cnt) + { + d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1); + d1 = d1 >> 1; + q <<= 1; + if (n1 > d1 || (n1 == d1 && n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + q |= 1; + } + cnt--; + } + + *qh = 0; + return q; + } +} + +mp_size_t +#if EXTEND +#if __STDC__ +mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size, + mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize) +#else +mpn_gcdext (gp, s0p, s0size, up, size, vp, vsize) + mp_ptr gp; + mp_ptr s0p; + mp_size_t *s0size; + mp_ptr up; + mp_size_t size; + mp_ptr vp; + mp_size_t vsize; +#endif +#else +#if __STDC__ +mpn_gcd (mp_ptr gp, + mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize) +#else +mpn_gcd (gp, up, size, vp, vsize) + mp_ptr gp; + mp_ptr up; + mp_size_t size; + mp_ptr vp; + mp_size_t vsize; +#endif +#endif +{ + mp_limb_t A, B, C, D; + int cnt; + mp_ptr tp, wp; +#if RECORD + mp_limb_t max = 0; +#endif +#if EXTEND + mp_ptr s1p; + mp_ptr orig_s0p = s0p; + mp_size_t ssize; + int sign = 1; +#endif + int use_double_flag; + TMP_DECL (mark); + + TMP_MARK (mark); + + use_double_flag = (size >= GCDEXT_THRESHOLD); + + tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); + wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); +#if EXTEND + s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); + + MPN_ZERO (s0p, size); + MPN_ZERO (s1p, size); + + s0p[0] = 1; + s1p[0] = 0; + ssize = 1; +#endif + + if (size > vsize) + { + /* Normalize V (and shift up U the same amount). */ + count_leading_zeros (cnt, vp[vsize - 1]); + if (cnt != 0) + { + mp_limb_t cy; + mpn_lshift (vp, vp, vsize, cnt); + cy = mpn_lshift (up, up, size, cnt); + up[size] = cy; + size += cy != 0; + } + + mpn_divmod (up + vsize, up, size, vp, vsize); +#if EXTEND + /* This is really what it boils down to in this case... */ + s0p[0] = 0; + s1p[0] = 1; + sign = -sign; +#endif + size = vsize; + if (cnt != 0) + { + mpn_rshift (up, up, size, cnt); + mpn_rshift (vp, vp, size, cnt); + } + MP_PTR_SWAP (up, vp); + } + + for (;;) + { + mp_limb_t asign; + /* Figure out exact size of V. */ + vsize = size; + MPN_NORMALIZE (vp, vsize); + if (vsize <= 1) + break; + + if (use_double_flag) + { + mp_limb_t uh, vh, ul, vl; + /* Let UH,UL be the most significant limbs of U, and let VH,VL be + the corresponding bits from V. 
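+
+	     The two trial quotients computed below bracket the true
+	     quotient of the full operands: one of (U +- A)/(V -+ C)
+	     and (U -+ B)/(V +- D) can only overestimate it and the
+	     other only underestimate it, because the single-limb
+	     cofactors bound how far uh:ul and vh:vl can drift from
+	     the truncated U and V.  Only when the two agree is the
+	     quotient known exactly, and the step is then applied to
+	     the 2x2 cofactor matrix and the top limbs alone,
+	     deferring all multi-limb work.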
*/ + uh = up[size - 1]; + vh = vp[size - 1]; + ul = up[size - 2]; + vl = vp[size - 2]; + count_leading_zeros (cnt, uh); + if (cnt != 0) + { + uh = (uh << cnt) | (ul >> (BITS_PER_MP_LIMB - cnt)); + vh = (vh << cnt) | (vl >> (BITS_PER_MP_LIMB - cnt)); + vl <<= cnt; + ul <<= cnt; + if (size >= 3) + { + ul |= (up[size - 3] >> (BITS_PER_MP_LIMB - cnt)); + vl |= (vp[size - 3] >> (BITS_PER_MP_LIMB - cnt)); + } + } + + A = 1; + B = 0; + C = 0; + D = 1; + + asign = 0; + for (;;) + { + mp_limb_t T; + mp_limb_t qh, q1, q2; + mp_limb_t nh, nl, dh, dl; + mp_limb_t t1, t0; + mp_limb_t Th, Tl; + + sub_ddmmss (dh, dl, vh, vl, 0, C); + if ((dl | dh) == 0) + break; + add_ssaaaa (nh, nl, uh, ul, 0, A); + q1 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + add_ssaaaa (dh, dl, vh, vl, 0, D); + if ((dl | dh) == 0) + break; + sub_ddmmss (nh, nl, uh, ul, 0, B); + q2 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + if (q1 != q2) + break; + + asign = ~asign; + + T = A + q1 * C; + A = C; + C = T; + T = B + q1 * D; + B = D; + D = T; + umul_ppmm (t1, t0, q1, vl); + t1 += q1 * vh; + sub_ddmmss (Th, Tl, uh, ul, t1, t0); + uh = vh, ul = vl; + vh = Th, vl = Tl; + + add_ssaaaa (dh, dl, vh, vl, 0, C); + sub_ddmmss (nh, nl, uh, ul, 0, A); + q1 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + sub_ddmmss (dh, dl, vh, vl, 0, D); + if ((dl | dh) == 0) + break; + add_ssaaaa (nh, nl, uh, ul, 0, B); + q2 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + if (q1 != q2) + break; + + asign = ~asign; + + T = A + q1 * C; + A = C; + C = T; + T = B + q1 * D; + B = D; + D = T; + umul_ppmm (t1, t0, q1, vl); + t1 += q1 * vh; + sub_ddmmss (Th, Tl, uh, ul, t1, t0); + uh = vh, ul = vl; + vh = Th, vl = Tl; + } +#if EXTEND + if (asign) + sign = -sign; +#endif + } + else /* Same, but using single-limb calculations. */ + { + mp_limb_t uh, vh; + /* Make UH be the most significant limb of U, and make VH be + corresponding bits from V. */ + uh = up[size - 1]; + vh = vp[size - 1]; + count_leading_zeros (cnt, uh); + if (cnt != 0) + { + uh = (uh << cnt) | (up[size - 2] >> (BITS_PER_MP_LIMB - cnt)); + vh = (vh << cnt) | (vp[size - 2] >> (BITS_PER_MP_LIMB - cnt)); + } + + A = 1; + B = 0; + C = 0; + D = 1; + + asign = 0; + for (;;) + { + mp_limb_t q, T; + if (vh - C == 0 || vh + D == 0) + break; + + q = (uh + A) / (vh - C); + if (q != (uh - B) / (vh + D)) + break; + + asign = ~asign; + + T = A + q * C; + A = C; + C = T; + T = B + q * D; + B = D; + D = T; + T = uh - q * vh; + uh = vh; + vh = T; + + if (vh - D == 0) + break; + + q = (uh - A) / (vh + C); + if (q != (uh + B) / (vh - D)) + break; + + asign = ~asign; + + T = A + q * C; + A = C; + C = T; + T = B + q * D; + B = D; + D = T; + T = uh - q * vh; + uh = vh; + vh = T; + } +#if EXTEND + if (asign) + sign = -sign; +#endif + } + +#if RECORD + max = MAX (A, max); max = MAX (B, max); + max = MAX (C, max); max = MAX (D, max); +#endif + + if (B == 0) + { + mp_limb_t qh; + mp_size_t i; + /* This is quite rare. I.e., optimize something else! */ + + /* Normalize V (and shift up U the same amount). 
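+
+	     (Illustrative note, not in the original source: B == 0 means
+	     the Lehmer step made no progress, so one full multi-limb
+	     division U = Q*V + R is performed instead.  The cofactors then
+	     follow the ordinary extended-Euclid recurrence
+
+	         s[new] = s[old] + Q * s[cur],
+
+	     with the sign tracked separately in SIGN; the mpn_addmul_1
+	     loops below compute exactly this product accumulation.)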
*/ + count_leading_zeros (cnt, vp[vsize - 1]); + if (cnt != 0) + { + mp_limb_t cy; + mpn_lshift (vp, vp, vsize, cnt); + cy = mpn_lshift (up, up, size, cnt); + up[size] = cy; + size += cy != 0; + } + + qh = mpn_divmod (up + vsize, up, size, vp, vsize); +#if EXTEND + MPN_COPY (tp, s0p, ssize); + { + mp_size_t qsize; + + qsize = size - vsize; /* size of stored quotient from division */ + if (ssize < qsize) + { + MPN_ZERO (tp + ssize, qsize - ssize); + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < ssize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, up + vsize, qsize, s1p[i]); + tp[qsize + i] = cy; + } + if (qh != 0) + { + mp_limb_t cy; + cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize); + if (cy != 0) + abort (); + } + } + else + { + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < qsize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, s1p, ssize, up[vsize + i]); + tp[ssize + i] = cy; + } + if (qh != 0) + { + mp_limb_t cy; + cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize); + if (cy != 0) + { + tp[qsize + ssize] = cy; + s1p[qsize + ssize] = 0; + ssize++; + } + } + } + ssize += qsize; + ssize -= tp[ssize - 1] == 0; + } + + sign = -sign; + MP_PTR_SWAP (s0p, s1p); + MP_PTR_SWAP (s1p, tp); +#endif + size = vsize; + if (cnt != 0) + { + mpn_rshift (up, up, size, cnt); + mpn_rshift (vp, vp, size, cnt); + } + MP_PTR_SWAP (up, vp); + } + else + { +#if EXTEND + mp_size_t tsize, wsize; +#endif + /* T = U*A + V*B + W = U*C + V*D + U = T + V = W */ + +#if STAT + { mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x); + arr[BITS_PER_MP_LIMB - cnt]++; } +#endif + if (A == 0) + { + /* B == 1 and C == 1 (D is arbitrary) */ + mp_limb_t cy; + MPN_COPY (tp, vp, size); + MPN_COPY (wp, up, size); + mpn_submul_1 (wp, vp, size, D); + MP_PTR_SWAP (tp, up); + MP_PTR_SWAP (wp, vp); +#if EXTEND + MPN_COPY (tp, s1p, ssize); + tsize = ssize; + tp[ssize] = 0; /* must zero since wp might spill below */ + MPN_COPY (wp, s0p, ssize); + cy = mpn_addmul_1 (wp, s1p, ssize, D); + wp[ssize] = cy; + wsize = ssize + (cy != 0); + MP_PTR_SWAP (tp, s0p); + MP_PTR_SWAP (wp, s1p); + ssize = MAX (wsize, tsize); +#endif + } + else + { + if (asign) + { + mp_limb_t cy; + mpn_mul_1 (tp, vp, size, B); + mpn_submul_1 (tp, up, size, A); + mpn_mul_1 (wp, up, size, C); + mpn_submul_1 (wp, vp, size, D); + MP_PTR_SWAP (tp, up); + MP_PTR_SWAP (wp, vp); +#if EXTEND + cy = mpn_mul_1 (tp, s1p, ssize, B); + cy += mpn_addmul_1 (tp, s0p, ssize, A); + tp[ssize] = cy; + tsize = ssize + (cy != 0); + cy = mpn_mul_1 (wp, s0p, ssize, C); + cy += mpn_addmul_1 (wp, s1p, ssize, D); + wp[ssize] = cy; + wsize = ssize + (cy != 0); + MP_PTR_SWAP (tp, s0p); + MP_PTR_SWAP (wp, s1p); + ssize = MAX (wsize, tsize); +#endif + } + else + { + mp_limb_t cy; + mpn_mul_1 (tp, up, size, A); + mpn_submul_1 (tp, vp, size, B); + mpn_mul_1 (wp, vp, size, D); + mpn_submul_1 (wp, up, size, C); + MP_PTR_SWAP (tp, up); + MP_PTR_SWAP (wp, vp); +#if EXTEND + cy = mpn_mul_1 (tp, s0p, ssize, A); + cy += mpn_addmul_1 (tp, s1p, ssize, B); + tp[ssize] = cy; + tsize = ssize + (cy != 0); + cy = mpn_mul_1 (wp, s1p, ssize, D); + cy += mpn_addmul_1 (wp, s0p, ssize, C); + wp[ssize] = cy; + wsize = ssize + (cy != 0); + MP_PTR_SWAP (tp, s0p); + MP_PTR_SWAP (wp, s1p); + ssize = MAX (wsize, tsize); +#endif + } + } + + size -= up[size - 1] == 0; + } + } + +#if RECORD + printf ("max: %lx\n", max); +#endif + +#if STAT + {int i; for (i = 0; i < BITS_PER_MP_LIMB; i++) printf ("%d:%d\n", i, arr[i]);} +#endif + + if (vsize == 0) + { + if (gp != up 
&& gp != 0) + MPN_COPY (gp, up, size); +#if EXTEND + MPN_NORMALIZE (s0p, ssize); + if (orig_s0p != s0p) + MPN_COPY (orig_s0p, s0p, ssize); + *s0size = sign >= 0 ? ssize : -ssize; +#endif + TMP_FREE (mark); + return size; + } + else + { + mp_limb_t vl, ul, t; +#if EXTEND + mp_size_t qsize, i; +#endif + vl = vp[0]; +#if EXTEND + t = mpn_divmod_1 (wp, up, size, vl); + + MPN_COPY (tp, s0p, ssize); + + qsize = size - (wp[size - 1] == 0); /* size of quotient from division */ + if (ssize < qsize) + { + MPN_ZERO (tp + ssize, qsize - ssize); + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < ssize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]); + tp[qsize + i] = cy; + } + } + else + { + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < qsize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]); + tp[ssize + i] = cy; + } + } + ssize += qsize; + ssize -= tp[ssize - 1] == 0; + + sign = -sign; + MP_PTR_SWAP (s0p, s1p); + MP_PTR_SWAP (s1p, tp); +#else + t = mpn_mod_1 (up, size, vl); +#endif + ul = vl; + vl = t; + while (vl != 0) + { + mp_limb_t t; +#if EXTEND + mp_limb_t q; + q = ul / vl; + t = ul - q * vl; + + MPN_COPY (tp, s0p, ssize); + + MPN_ZERO (s1p + ssize, 1); /* zero s1 too */ + + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp, s1p, ssize, q); + tp[ssize] = cy; + } + + ssize += 1; + ssize -= tp[ssize - 1] == 0; + + sign = -sign; + MP_PTR_SWAP (s0p, s1p); + MP_PTR_SWAP (s1p, tp); +#else + t = ul % vl; +#endif + ul = vl; + vl = t; + } + if (gp != 0) + gp[0] = ul; +#if EXTEND + MPN_NORMALIZE (s0p, ssize); + if (orig_s0p != s0p) + MPN_COPY (orig_s0p, s0p, ssize); + *s0size = sign >= 0 ? ssize : -ssize; +#endif + TMP_FREE (mark); + return 1; + } +} diff --git a/rts/gmp/mpn/generic/get_str.c b/rts/gmp/mpn/generic/get_str.c new file mode 100644 index 0000000000..a713b61825 --- /dev/null +++ b/rts/gmp/mpn/generic/get_str.c @@ -0,0 +1,216 @@ +/* mpn_get_str -- Convert a MSIZE long limb vector pointed to by MPTR + to a printable string in STR in base BASE. + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Convert the limb vector pointed to by MPTR and MSIZE long to a + char array, using base BASE for the result array. Store the + result in the character array STR. STR must point to an array with + space for the largest possible number represented by a MSIZE long + limb vector + 1 extra character. + + The result is NOT in Ascii, to convert it to printable format, add + '0' or 'A' depending on the base and range. + + Return the number of digits in the result string. + This may include some leading zeros. 
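+
+   (Illustrative example, not part of the original comment: converting
+   the single limb {42} in base 10 stores the digit values
+   0,...,0,4,2 -- padded with leading zeros up to the conservative
+   length estimate -- and the caller maps each value d to a printable
+   character as d + '0'.)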
+ + The limb vector pointed to by MPTR is clobbered. */ + +size_t +#if __STDC__ +mpn_get_str (unsigned char *str, int base, mp_ptr mptr, mp_size_t msize) +#else +mpn_get_str (str, base, mptr, msize) + unsigned char *str; + int base; + mp_ptr mptr; + mp_size_t msize; +#endif +{ + mp_limb_t big_base; +#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME + int normalization_steps; +#endif +#if UDIV_TIME > 2 * UMUL_TIME + mp_limb_t big_base_inverted; +#endif + unsigned int dig_per_u; + mp_size_t out_len; + register unsigned char *s; + + big_base = __mp_bases[base].big_base; + + s = str; + + /* Special case zero, as the code below doesn't handle it. */ + if (msize == 0) + { + s[0] = 0; + return 1; + } + + if ((base & (base - 1)) == 0) + { + /* The base is a power of 2. Make conversion from most + significant side. */ + mp_limb_t n1, n0; + register int bits_per_digit = big_base; + register int x; + register int bit_pos; + register int i; + + n1 = mptr[msize - 1]; + count_leading_zeros (x, n1); + + /* BIT_POS should be R when input ends in least sign. nibble, + R + bits_per_digit * n when input ends in n:th least significant + nibble. */ + + { + int bits; + + bits = BITS_PER_MP_LIMB * msize - x; + x = bits % bits_per_digit; + if (x != 0) + bits += bits_per_digit - x; + bit_pos = bits - (msize - 1) * BITS_PER_MP_LIMB; + } + + /* Fast loop for bit output. */ + i = msize - 1; + for (;;) + { + bit_pos -= bits_per_digit; + while (bit_pos >= 0) + { + *s++ = (n1 >> bit_pos) & ((1 << bits_per_digit) - 1); + bit_pos -= bits_per_digit; + } + i--; + if (i < 0) + break; + n0 = (n1 << -bit_pos) & ((1 << bits_per_digit) - 1); + n1 = mptr[i]; + bit_pos += BITS_PER_MP_LIMB; + *s++ = n0 | (n1 >> bit_pos); + } + + *s = 0; + + return s - str; + } + else + { + /* General case. The base is not a power of 2. Make conversion + from least significant end. */ + + /* If udiv_qrnnd only handles divisors with the most significant bit + set, prepare BIG_BASE for being a divisor by shifting it to the + left exactly enough to set the most significant bit. */ +#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME + count_leading_zeros (normalization_steps, big_base); + big_base <<= normalization_steps; +#if UDIV_TIME > 2 * UMUL_TIME + /* Get the fixed-point approximation to 1/(BIG_BASE << NORMALIZATION_STEPS). */ + big_base_inverted = __mp_bases[base].big_base_inverted; +#endif +#endif + + dig_per_u = __mp_bases[base].chars_per_limb; + out_len = ((size_t) msize * BITS_PER_MP_LIMB + * __mp_bases[base].chars_per_bit_exactly) + 1; + s += out_len; + + while (msize != 0) + { + int i; + mp_limb_t n0, n1; + +#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME + /* If we shifted BIG_BASE above, shift the dividend too, to get + the right quotient. We need to do this every loop, + since the intermediate quotients are OK, but the quotient from + one turn in the loop is going to be the dividend in the + next turn, and the dividend needs to be up-shifted. */ + if (normalization_steps != 0) + { + n0 = mpn_lshift (mptr, mptr, msize, normalization_steps); + + /* If the shifting gave a carry out limb, store it and + increase the length. */ + if (n0 != 0) + { + mptr[msize] = n0; + msize++; + } + } +#endif + + /* Divide the number at TP with BIG_BASE to get a quotient and a + remainder. The remainder is our new digit in base BIG_BASE. 
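+
+	 (Illustrative note, not in the original: for base 10 with 32-bit
+	 limbs, BIG_BASE is 10^9, the largest power of ten fitting in one
+	 limb, so each pass through the loop below peels nine decimal
+	 digits off the number at once.)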
*/ + i = msize - 1; + n1 = mptr[i]; + + if (n1 >= big_base) + n1 = 0; + else + { + msize--; + i--; + } + + for (; i >= 0; i--) + { + n0 = mptr[i]; +#if UDIV_TIME > 2 * UMUL_TIME + udiv_qrnnd_preinv (mptr[i], n1, n1, n0, big_base, big_base_inverted); +#else + udiv_qrnnd (mptr[i], n1, n1, n0, big_base); +#endif + } + +#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME + /* If we shifted above (at previous UDIV_NEEDS_NORMALIZATION tests) + the remainder will be up-shifted here. Compensate. */ + n1 >>= normalization_steps; +#endif + + /* Convert N1 from BIG_BASE to a string of digits in BASE + using single precision operations. */ + for (i = dig_per_u - 1; i >= 0; i--) + { + *--s = n1 % base; + n1 /= base; + if (n1 == 0 && msize == 0) + break; + } + } + + while (s != str) + *--s = 0; + return out_len; + } +} diff --git a/rts/gmp/mpn/generic/gmp-mparam.h b/rts/gmp/mpn/generic/gmp-mparam.h new file mode 100644 index 0000000000..14bcaece83 --- /dev/null +++ b/rts/gmp/mpn/generic/gmp-mparam.h @@ -0,0 +1,27 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 diff --git a/rts/gmp/mpn/generic/hamdist.c b/rts/gmp/mpn/generic/hamdist.c new file mode 100644 index 0000000000..35c10e8450 --- /dev/null +++ b/rts/gmp/mpn/generic/hamdist.c @@ -0,0 +1,94 @@ +/* mpn_hamdist -- + +Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#if defined __GNUC__ +/* No processor claiming to be SPARC v9 compliant seem to + implement the POPC instruction. Disable pattern for now. 
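+
+   (Illustrative note, not part of the original comment: with the asm
+   pattern disabled, popc_limb below falls back to the standard
+   bit-slicing population count -- sum bits in pairs, then nibbles,
+   then bytes, e.g. for 32 bits:
+
+       x = x - ((x >> 1) & 0x55555555);
+       x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+       x = (x + (x >> 4)) & 0x0f0f0f0f;
+
+   after which the byte counts are folded together with shifts.)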
*/ +#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64 +#define popc_limb(a) \ + ({ \ + DItype __res; \ + asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \ + __res; \ + }) +#endif +#endif + +#ifndef popc_limb + +/* Cool population count of a mp_limb_t. + You have to figure out how this works, I won't tell you! */ + +static inline unsigned int +#if __STDC__ +popc_limb (mp_limb_t x) +#else +popc_limb (x) + mp_limb_t x; +#endif +{ +#if BITS_PER_MP_LIMB == 64 + /* We have to go into some trouble to define these constants. + (For mp_limb_t being `long long'.) */ + mp_limb_t cnst; + cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2); + x -= (x & cnst) >> 1; + cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2); + x = ((x & ~cnst) >> 2) + (x & cnst); + cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2); + x = ((x >> 4) + x) & cnst; + x = ((x >> 8) + x); + x = ((x >> 16) + x); + x = ((x >> 32) + x) & 0xff; +#endif +#if BITS_PER_MP_LIMB == 32 + x -= (x & 0xaaaaaaaa) >> 1; + x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L); + x = ((x >> 4) + x) & 0x0f0f0f0fL; + x = ((x >> 8) + x); + x = ((x >> 16) + x) & 0xff; +#endif + return x; +} +#endif + +unsigned long int +#if __STDC__ +mpn_hamdist (mp_srcptr up, mp_srcptr vp, mp_size_t size) +#else +mpn_hamdist (up, vp, size) + register mp_srcptr up; + register mp_srcptr vp; + register mp_size_t size; +#endif +{ + unsigned long int hamdist; + mp_size_t i; + + hamdist = 0; + for (i = 0; i < size; i++) + hamdist += popc_limb (up[i] ^ vp[i]); + + return hamdist; +} diff --git a/rts/gmp/mpn/generic/inlines.c b/rts/gmp/mpn/generic/inlines.c new file mode 100644 index 0000000000..9487e58cf2 --- /dev/null +++ b/rts/gmp/mpn/generic/inlines.c @@ -0,0 +1,24 @@ +/* +Copyright (C) 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#define _FORCE_INLINES +#define _EXTERN_INLINE /* empty */ +#include "gmp.h" diff --git a/rts/gmp/mpn/generic/jacbase.c b/rts/gmp/mpn/generic/jacbase.c new file mode 100644 index 0000000000..dd437f1ac1 --- /dev/null +++ b/rts/gmp/mpn/generic/jacbase.c @@ -0,0 +1,136 @@ +/* mpn_jacobi_base -- limb/limb Jacobi symbol with restricted arguments. + + THIS INTERFACE IS PRELIMINARY AND MIGHT DISAPPEAR OR BE SUBJECT TO + INCOMPATIBLE CHANGES IN A FUTURE RELEASE OF GMP. */ + +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +#if COUNT_TRAILING_ZEROS_TIME <= 7 +/* If count_trailing_zeros is fast, use it. + K7 at 7 cycles and P6 at 2 are good here. K6 at 12-27 and P5 at 18-42 + are not. The default 15 in longlong.h is meant to mean not good here. */ + +#define PROCESS_TWOS_ANY \ + { \ + mp_limb_t twos; \ + count_trailing_zeros (twos, a); \ + result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b); \ + a >>= twos; \ + } + +#define PROCESS_TWOS_EVEN PROCESS_TWOS_ANY + +#else +/* Use a loop instead. With "a" uniformly distributed there will usually be + only a few trailing zeros. + + Unfortunately the branch for the while loop here will be on a 50/50 + chance of a 1 or 0, which is bad for branch prediction. */ + +#define PROCESS_TWOS_EVEN \ + { \ + int two; \ + two = JACOBI_TWO_U_BIT1 (b); \ + do \ + { \ + a >>= 1; \ + result_bit1 ^= two; \ + ASSERT (a != 0); \ + } \ + while ((a & 1) == 0); \ + } + +#define PROCESS_TWOS_ANY \ + if ((a & 1) == 0) \ + PROCESS_TWOS_EVEN; + +#endif + + +/* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but + with a restricted range of inputs accepted, namely b>1, b odd, and a<=b. + + The initial result_bit1 is taken as a parameter for the convenience of + mpz_kronecker_zi_ui() et al. The sign changes both here and in those + routines accumulate nicely in bit 1, see the JACOBI macros. + + The return value here is the normal +1, 0, or -1. Note that +1 and -1 + have bit 1 in the "BIT1" sense, which could be useful if the caller is + accumulating it into some extended calculation. + + Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be + possible, but a couple of tests suggest it's not a significant speedup, + and may even be a slowdown, so what's here is good enough for now. + + Future: The code doesn't demand a<=b actually, so maybe this could be + relaxed. All the places this is used currently call with a<=b though. */ + +int +#if __STDC__ +mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1) +#else +mpn_jacobi_base (a, b, result_bit1) + mp_limb_t a; + mp_limb_t b; + int result_bit1; +#endif +{ + ASSERT (b & 1); /* b odd */ + ASSERT (b != 1); + ASSERT (a <= b); + + if (a == 0) + return 0; + + PROCESS_TWOS_ANY; + if (a == 1) + goto done; + + for (;;) + { + result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b); + MP_LIMB_T_SWAP (a, b); + + do + { + /* working on (a/b), a,b odd, a>=b */ + ASSERT (a & 1); + ASSERT (b & 1); + ASSERT (a >= b); + + if ((a -= b) == 0) + return 0; + + PROCESS_TWOS_EVEN; + if (a == 1) + goto done; + } + while (a >= b); + } + + done: + return JACOBI_BIT1_TO_PN (result_bit1); +} diff --git a/rts/gmp/mpn/generic/lshift.c b/rts/gmp/mpn/generic/lshift.c new file mode 100644 index 0000000000..0b58389658 --- /dev/null +++ b/rts/gmp/mpn/generic/lshift.c @@ -0,0 +1,87 @@ +/* mpn_lshift -- Shift left low level. + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Shift U (pointed to by UP and USIZE digits long) CNT bits to the left + and store the USIZE least significant digits of the result at WP. + Return the bits shifted out from the most significant digit. + + Argument constraints: + 1. 0 < CNT < BITS_PER_MP_LIMB + 2. If the result is to be written over the input, WP must be >= UP. +*/ + +mp_limb_t +#if __STDC__ +mpn_lshift (register mp_ptr wp, + register mp_srcptr up, mp_size_t usize, + register unsigned int cnt) +#else +mpn_lshift (wp, up, usize, cnt) + register mp_ptr wp; + register mp_srcptr up; + mp_size_t usize; + register unsigned int cnt; +#endif +{ + register mp_limb_t high_limb, low_limb; + register unsigned sh_1, sh_2; + register mp_size_t i; + mp_limb_t retval; + +#ifdef DEBUG + if (usize == 0 || cnt == 0) + abort (); +#endif + + sh_1 = cnt; +#if 0 + if (sh_1 == 0) + { + if (wp != up) + { + /* Copy from high end to low end, to allow specified input/output + overlapping. */ + for (i = usize - 1; i >= 0; i--) + wp[i] = up[i]; + } + return 0; + } +#endif + + wp += 1; + sh_2 = BITS_PER_MP_LIMB - sh_1; + i = usize - 1; + low_limb = up[i]; + retval = low_limb >> sh_2; + high_limb = low_limb; + while (--i >= 0) + { + low_limb = up[i]; + wp[i] = (high_limb << sh_1) | (low_limb >> sh_2); + high_limb = low_limb; + } + wp[i] = high_limb << sh_1; + + return retval; +} diff --git a/rts/gmp/mpn/generic/mod_1.c b/rts/gmp/mpn/generic/mod_1.c new file mode 100644 index 0000000000..168ec9df49 --- /dev/null +++ b/rts/gmp/mpn/generic/mod_1.c @@ -0,0 +1,175 @@ +/* mpn_mod_1(dividend_ptr, dividend_size, divisor_limb) -- + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + +Copyright (C) 1991, 1993, 1994, 1999 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef UMUL_TIME +#define UMUL_TIME 1 +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME UMUL_TIME +#endif + +mp_limb_t +#if __STDC__ +mpn_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size, + mp_limb_t divisor_limb) +#else +mpn_mod_1 (dividend_ptr, dividend_size, divisor_limb) + mp_srcptr dividend_ptr; + mp_size_t dividend_size; + mp_limb_t divisor_limb; +#endif +{ + mp_size_t i; + mp_limb_t n1, n0, r; + int dummy; + + /* Botch: Should this be handled at all? Rely on callers? */ + if (dividend_size == 0) + return 0; + + /* If multiplication is much faster than division, and the + dividend is large, pre-invert the divisor, and use + only multiplications in the inner loop. */ + + /* This test should be read: + Does it ever help to use udiv_qrnnd_preinv? + && Does what we save compensate for the inversion overhead? */ + if (UDIV_TIME > (2 * UMUL_TIME + 6) + && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + mp_limb_t divisor_limb_inverted; + + divisor_limb <<= normalization_steps; + invert_limb (divisor_limb_inverted, divisor_limb); + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... */ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (dummy, r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb, divisor_limb_inverted); + n1 = n0; + } + udiv_qrnnd_preinv (dummy, r, r, + n1 << normalization_steps, + divisor_limb, divisor_limb_inverted); + return r >> normalization_steps; + } + else + { + mp_limb_t divisor_limb_inverted; + + invert_limb (divisor_limb_inverted, divisor_limb); + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + i--; + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (dummy, r, r, + n0, divisor_limb, divisor_limb_inverted); + } + return r; + } + } + else + { + if (UDIV_NEEDS_NORMALIZATION) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + divisor_limb <<= normalization_steps; + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... */ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (dummy, r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb); + n1 = n0; + } + udiv_qrnnd (dummy, r, r, + n1 << normalization_steps, + divisor_limb); + return r >> normalization_steps; + } + } + /* No normalization needed, either because udiv_qrnnd doesn't require + it, or because DIVISOR_LIMB is already normalized. 
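+
+	 (Illustrative note, not in the original: this is plain schoolbook
+	 remaindering -- one udiv_qrnnd per limb, scanning from the most
+	 significant end and keeping only the running remainder:
+
+	     r = (r * 2^BITS_PER_MP_LIMB + next_limb) % divisor_limb.)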
*/ + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + i--; + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (dummy, r, r, n0, divisor_limb); + } + return r; + } +} diff --git a/rts/gmp/mpn/generic/mod_1_rs.c b/rts/gmp/mpn/generic/mod_1_rs.c new file mode 100644 index 0000000000..62aaa94b92 --- /dev/null +++ b/rts/gmp/mpn/generic/mod_1_rs.c @@ -0,0 +1,111 @@ +/* mpn_mod_1_rshift -- mpn remainder under hypothetical right shift. + + THE FUNCTION IN THIS FILE IS FOR INTERNAL USE AND HAS A MUTABLE + INTERFACE. IT IS ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. + IT'S ALMOST GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP + RELEASE. */ + +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* When testing on a CPU with UDIV_NEEDS_NORMALIZATION equal to 0, it can be + changed to 1 temporarily to test the code under that case too. */ +#if 0 +#undef UDIV_NEEDS_NORMALIZATION +#define UDIV_NEEDS_NORMALIZATION 1 +#endif + + +/* Calculate the remainder "(ptr,size >> shift) % divisor". Note ptr,size + is unchanged, the shift is only for its effect on the remainder. + The shift doesn't even need to be considered until the last limb. + + This function has the normal size!=0 restriction, unlike the basic + mpn_mod_1. 
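+
+   (Illustrative example, not part of the original comment: for
+   size == 1, shift == 4, divisor == 10 and ptr[0] == 0x1234, the
+   result is (0x1234 >> 4) % 10 == 0x123 % 10 == 1.  For larger sizes
+   the vector is never actually shifted; the shift is folded into the
+   final udiv_qrnnd on the low limb.)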
*/ + +mp_limb_t +#if __STDC__ +mpn_mod_1_rshift (mp_srcptr ptr, mp_size_t size, unsigned shift, + mp_limb_t divisor) +#else +mpn_mod_1_rshift (ptr, size, shift, divisor) + mp_srcptr ptr; + mp_size_t size; + unsigned shift; + mp_limb_t divisor; +#endif +{ + mp_limb_t quot, rem; + + ASSERT (shift >= 1); + ASSERT (shift < BITS_PER_MP_LIMB); + ASSERT (size >= 1); + + if (size == 1) + return (ptr[0] >> shift) % divisor; + +#if UDIV_NEEDS_NORMALIZATION + { + int norm; + int delta; + + count_leading_zeros (norm, divisor); + divisor <<= norm; + + delta = shift - norm; + if (delta == 0) + return mpn_mod_1 (ptr, size, divisor) >> norm; + + if (delta > 0) + { + rem = mpn_mod_1 (ptr+1, size-1, divisor); + udiv_qrnnd (quot, rem, + rem >> delta, + (rem << (BITS_PER_MP_LIMB-delta)) | (ptr[0] >> delta), + divisor); + return rem >> norm; + } + else + { + rem = mpn_mod_1 (ptr, size, divisor); + udiv_qrnnd (quot, rem, + rem >> (BITS_PER_MP_LIMB+delta), + rem << -delta, + divisor); + return rem >> norm; + } + } + +#else /* !UDIV_NEEDS_NORMALIZATION */ + + rem = mpn_mod_1 (ptr+1, size-1, divisor); + udiv_qrnnd (quot, rem, + rem >> shift, + (rem << (BITS_PER_MP_LIMB-shift)) | (ptr[0] >> shift), + divisor); + return rem; + +#endif +} diff --git a/rts/gmp/mpn/generic/mul.c b/rts/gmp/mpn/generic/mul.c new file mode 100644 index 0000000000..cecfa19ca1 --- /dev/null +++ b/rts/gmp/mpn/generic/mul.c @@ -0,0 +1,190 @@ +/* mpn_mul -- Multiply two natural numbers. + + THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul) + ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH + THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v + (pointed to by VP, with VN limbs), and store the result at PRODP. The + result is UN + VN limbs. Return the most significant limb of the result. + + NOTE: The space pointed to by PRODP is overwritten before finished with U + and V, so overlap is an error. + + Argument constraints: + 1. UN >= VN. + 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from + the multiplier and the multiplicand. 
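+
+   (Illustrative usage sketch, not part of the original comment; the
+   caller supplies all storage and must have un >= vn:
+
+       mp_limb_t u[4], v[2], prod[6];
+       mp_limb_t msl;
+       ... fill u with 4 limbs and v with 2 limbs, then:
+       msl = mpn_mul (prod, u, 4, v, 2);
+
+   afterwards prod[0..5] holds the 6-limb product and msl == prod[5].)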
*/ + +void +#if __STDC__ +mpn_sqr_n (mp_ptr prodp, + mp_srcptr up, mp_size_t un) +#else +mpn_sqr_n (prodp, up, un) + mp_ptr prodp; + mp_srcptr up; + mp_size_t un; +#endif +{ + if (un < KARATSUBA_SQR_THRESHOLD) + { /* plain schoolbook multiplication */ + if (un == 0) + return; + mpn_sqr_basecase (prodp, up, un); + } + else if (un < TOOM3_SQR_THRESHOLD) + { /* karatsuba multiplication */ + mp_ptr tspace; + TMP_DECL (marker); + TMP_MARK (marker); + tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB); + mpn_kara_sqr_n (prodp, up, un, tspace); + TMP_FREE (marker); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else if (un < FFT_SQR_THRESHOLD) +#else + else +#endif + { /* toom3 multiplication */ + mp_ptr tspace; + TMP_DECL (marker); + TMP_MARK (marker); + tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB); + mpn_toom3_sqr_n (prodp, up, un, tspace); + TMP_FREE (marker); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else + { + /* schoenhage multiplication */ + mpn_mul_fft_full (prodp, up, un, up, un); + } +#endif +} + +mp_limb_t +#if __STDC__ +mpn_mul (mp_ptr prodp, + mp_srcptr up, mp_size_t un, + mp_srcptr vp, mp_size_t vn) +#else +mpn_mul (prodp, up, un, vp, vn) + mp_ptr prodp; + mp_srcptr up; + mp_size_t un; + mp_srcptr vp; + mp_size_t vn; +#endif +{ + mp_size_t l; + mp_limb_t c; + + if (up == vp && un == vn) + { + mpn_sqr_n (prodp, up, un); + return prodp[2 * un - 1]; + } + + if (vn < KARATSUBA_MUL_THRESHOLD) + { /* long multiplication */ + mpn_mul_basecase (prodp, up, un, vp, vn); + return prodp[un + vn - 1]; + } + + mpn_mul_n (prodp, up, vp, vn); + if (un != vn) + { mp_limb_t t; + mp_ptr ws; + TMP_DECL (marker); + TMP_MARK (marker); + + prodp += vn; + l = vn; + up += vn; + un -= vn; + + if (un < vn) + { + /* Swap u's and v's. */ + MPN_SRCPTR_SWAP (up,un, vp,vn); + } + + ws = (mp_ptr) TMP_ALLOC (((vn >= KARATSUBA_MUL_THRESHOLD ? vn : un) + vn) + * BYTES_PER_MP_LIMB); + + t = 0; + while (vn >= KARATSUBA_MUL_THRESHOLD) + { + mpn_mul_n (ws, up, vp, vn); + if (l <= 2*vn) + { + t += mpn_add_n (prodp, prodp, ws, l); + if (l != 2*vn) + { + t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t); + l = 2*vn; + } + } + else + { + c = mpn_add_n (prodp, prodp, ws, 2*vn); + t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c); + } + prodp += vn; + l -= vn; + up += vn; + un -= vn; + if (un < vn) + { + /* Swap u's and v's. */ + MPN_SRCPTR_SWAP (up,un, vp,vn); + } + } + + if (vn) + { + mpn_mul_basecase (ws, up, un, vp, vn); + if (l <= un + vn) + { + t += mpn_add_n (prodp, prodp, ws, l); + if (l != un + vn) + t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t); + } + else + { + c = mpn_add_n (prodp, prodp, ws, un + vn); + t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c); + } + } + + TMP_FREE (marker); + } + return prodp[un + vn - 1]; +} diff --git a/rts/gmp/mpn/generic/mul_1.c b/rts/gmp/mpn/generic/mul_1.c new file mode 100644 index 0000000000..1c36b5fb1f --- /dev/null +++ b/rts/gmp/mpn/generic/mul_1.c @@ -0,0 +1,59 @@ +/* mpn_mul_1 -- Multiply a limb vector with a single limb and + store the product in a second limb vector. + +Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_mul_1 (res_ptr, s1_ptr, s1_size, s2_limb) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + mp_size_t s1_size; + register mp_limb_t s2_limb; +{ + register mp_limb_t cy_limb; + register mp_size_t j; + register mp_limb_t prod_high, prod_low; + + /* The loop counter and index J goes from -S1_SIZE to -1. This way + the loop becomes faster. */ + j = -s1_size; + + /* Offset the base pointers to compensate for the negative indices. */ + s1_ptr -= j; + res_ptr -= j; + + cy_limb = 0; + do + { + umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb); + + prod_low += cy_limb; + cy_limb = (prod_low < cy_limb) + prod_high; + + res_ptr[j] = prod_low; + } + while (++j != 0); + + return cy_limb; +} diff --git a/rts/gmp/mpn/generic/mul_basecase.c b/rts/gmp/mpn/generic/mul_basecase.c new file mode 100644 index 0000000000..00c06aa5c4 --- /dev/null +++ b/rts/gmp/mpn/generic/mul_basecase.c @@ -0,0 +1,87 @@ +/* mpn_mul_basecase -- Internal routine to multiply two natural numbers + of length m and n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Handle simple cases with traditional multiplication. + + This is the most critical code of multiplication. All multiplies rely on + this, both small and huge. Small ones arrive here immediately, huge ones + arrive here as this is the base case for Karatsuba's recursive algorithm. */ + +void +#if __STDC__ +mpn_mul_basecase (mp_ptr prodp, + mp_srcptr up, mp_size_t usize, + mp_srcptr vp, mp_size_t vsize) +#else +mpn_mul_basecase (prodp, up, usize, vp, vsize) + mp_ptr prodp; + mp_srcptr up; + mp_size_t usize; + mp_srcptr vp; + mp_size_t vsize; +#endif +{ + /* We first multiply by the low order one or two limbs, as the result can + be stored, not added, to PROD. We also avoid a loop for zeroing this + way. 
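+
+      (Illustrative note, not in the original: this is row-by-row
+      schoolbook multiplication.  The first row v[0]*U is written with
+      mpn_mul_1, so PROD needs no pre-zeroing; each later row v[i]*U is
+      accumulated at offset i with mpn_addmul_1.)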
*/ +#if HAVE_NATIVE_mpn_mul_2 + if (vsize >= 2) + { + prodp[usize + 1] = mpn_mul_2 (prodp, up, usize, vp[0], vp[1]); + prodp += 2, vp += 2, vsize -= 2; + } + else + { + prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]); + return; + } +#else + prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]); + prodp += 1, vp += 1, vsize -= 1; +#endif + +#if HAVE_NATIVE_mpn_addmul_2 + while (vsize >= 2) + { + prodp[usize + 1] = mpn_addmul_2 (prodp, up, usize, vp[0], vp[1]); + prodp += 2, vp += 2, vsize -= 2; + } + if (vsize != 0) + prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]); +#else + /* For each iteration in the loop, multiply U with one limb from V, and + add the result to PROD. */ + while (vsize != 0) + { + prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]); + prodp += 1, vp += 1, vsize -= 1; + } +#endif +} diff --git a/rts/gmp/mpn/generic/mul_fft.c b/rts/gmp/mpn/generic/mul_fft.c new file mode 100644 index 0000000000..00fd6d72de --- /dev/null +++ b/rts/gmp/mpn/generic/mul_fft.c @@ -0,0 +1,772 @@ +/* An implementation in GMP of Scho"nhage's fast multiplication algorithm + modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998. + + THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE + MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED + INTERFACES. IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN + A FUTURE GNU MP RELEASE. + +Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +/* References: + + Schnelle Multiplikation grosser Zahlen, by Arnold Scho"nhage and Volker + Strassen, Computing 7, p. 281-292, 1971. + + Asymptotically fast algorithms for the numerical multiplication + and division of polynomials with complex coefficients, by Arnold Scho"nhage, + Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982. + + Tapes versus Pointers, a study in implementing fast algorithms, + by Arnold Scho"nhage, Bulletin of the EATCS, 30, p. 23-32, 1986. + + See also http://www.loria.fr/~zimmerma/bignum + + + Future: + + K==2 isn't needed in the current uses of this code and the bits specific + for that could be dropped. + + It might be possible to avoid a small number of MPN_COPYs by using a + rotating temporary or two. + + Multiplications of unequal sized operands can be done with this code, but + it needs a tighter test for identifying squaring (same sizes as well as + same pointers). */ + + +#include <stdio.h> +#include "gmp.h" +#include "gmp-impl.h" + + +/* Change this to "#define TRACE(x) x" for some traces. 
*/ +#define TRACE(x) + + + +FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = { + FFT_MUL_TABLE, + FFT_SQR_TABLE +}; + + +static void mpn_mul_fft_internal +_PROTO ((mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl, + int k, int K, + mp_limb_t **Ap, mp_limb_t **Bp, + mp_limb_t *A, mp_limb_t *B, + mp_size_t nprime, mp_size_t l, mp_size_t Mp, int **_fft_l, + mp_limb_t *T, int rec)); + + +/* Find the best k to use for a mod 2^(n*BITS_PER_MP_LIMB)+1 FFT. + sqr==0 if for a multiply, sqr==1 for a square */ +int +#if __STDC__ +mpn_fft_best_k (mp_size_t n, int sqr) +#else +mpn_fft_best_k (n, sqr) + mp_size_t n; + int sqr; +#endif +{ + mp_size_t t; + int i; + + for (i = 0; mpn_fft_table[sqr][i] != 0; i++) + if (n < mpn_fft_table[sqr][i]) + return i + FFT_FIRST_K; + + /* treat 4*last as one further entry */ + if (i == 0 || n < 4*mpn_fft_table[sqr][i-1]) + return i + FFT_FIRST_K; + else + return i + FFT_FIRST_K + 1; +} + + +/* Returns smallest possible number of limbs >= pl for a fft of size 2^k. + FIXME: Is this simply pl rounded up to the next multiple of 2^k ? */ + +mp_size_t +#if __STDC__ +mpn_fft_next_size (mp_size_t pl, int k) +#else +mpn_fft_next_size (pl, k) + mp_size_t pl; + int k; +#endif +{ + mp_size_t N, M; + int K; + + /* if (k==0) k = mpn_fft_best_k (pl, sqr); */ + N = pl*BITS_PER_MP_LIMB; + K = 1<<k; + if (N%K) N=(N/K+1)*K; + M = N/K; + if (M%BITS_PER_MP_LIMB) N=((M/BITS_PER_MP_LIMB)+1)*BITS_PER_MP_LIMB*K; + return (N/BITS_PER_MP_LIMB); +} + + +static void +#if __STDC__ +mpn_fft_initl(int **l, int k) +#else +mpn_fft_initl(l, k) + int **l; + int k; +#endif +{ + int i,j,K; + + l[0][0] = 0; + for (i=1,K=2;i<=k;i++,K*=2) { + for (j=0;j<K/2;j++) { + l[i][j] = 2*l[i-1][j]; + l[i][K/2+j] = 1+l[i][j]; + } + } +} + + +/* a <- -a mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_neg_modF(mp_limb_t *ap, mp_size_t n) +#else +mpn_fft_neg_modF(ap, n) + mp_limb_t *ap; + mp_size_t n; +#endif +{ + mp_limb_t c; + + c = ap[n]+2; + mpn_com_n (ap, ap, n); + ap[n]=0; mpn_incr_u(ap, c); +} + + +/* a <- a*2^e mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_mul_2exp_modF(mp_limb_t *ap, int e, mp_size_t n, mp_limb_t *tp) +#else +mpn_fft_mul_2exp_modF(ap, e, n, tp) + mp_limb_t *ap; + int e; + mp_size_t n; + mp_limb_t *tp; +#endif +{ + int d, sh, i; mp_limb_t cc; + + d = e%(n*BITS_PER_MP_LIMB); /* 2^e = (+/-) 2^d */ + sh = d % BITS_PER_MP_LIMB; + if (sh) mpn_lshift(tp, ap, n+1, sh); /* no carry here */ + else MPN_COPY(tp, ap, n+1); + d /= BITS_PER_MP_LIMB; /* now shift of d limbs to the left */ + if (d) { + /* ap[d..n-1] = tp[0..n-d-1], ap[0..d-1] = -tp[n-d..n-1] */ + /* mpn_xor would be more efficient here */ + for (i=d-1;i>=0;i--) ap[i] = ~tp[n-d+i]; + cc = 1-mpn_add_1(ap, ap, d, 1); + if (cc) cc=mpn_sub_1(ap+d, tp, n-d, 1); + else MPN_COPY(ap+d, tp, n-d); + if (cc+=mpn_sub_1(ap+d, ap+d, n-d, tp[n])) + ap[n]=mpn_add_1(ap, ap, n, cc); + else ap[n]=0; + } + else if ((ap[n]=mpn_sub_1(ap, tp, n, tp[n]))) { + ap[n]=mpn_add_1(ap, ap, n, 1); + } + if ((e/(n*BITS_PER_MP_LIMB))%2) mpn_fft_neg_modF(ap, n); +} + + +/* a <- a+b mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_add_modF (mp_limb_t *ap, mp_limb_t *bp, int n) +#else +mpn_fft_add_modF (ap, bp, n) + mp_limb_t *ap,*bp; + int n; +#endif +{ + mp_limb_t c; + + c = ap[n] + bp[n] + mpn_add_n(ap, ap, bp, n); + if (c>1) c -= 1+mpn_sub_1(ap,ap,n,1); + ap[n]=c; +} + + +/* input: A[0] ... 
A[inc*(K-1)] are residues mod 2^N+1 where + N=n*BITS_PER_MP_LIMB + 2^omega is a primitive root mod 2^N+1 + output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */ + +static void +#if __STDC__ +mpn_fft_fft_sqr (mp_limb_t **Ap, mp_size_t K, int **ll, + mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp) +#else +mpn_fft_fft_sqr(Ap,K,ll,omega,n,inc,tp) +mp_limb_t **Ap,*tp; +mp_size_t K,omega,n,inc; +int **ll; +#endif +{ + if (K==2) { +#ifdef ADDSUB + if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1) +#else + MPN_COPY(tp, Ap[0], n+1); + mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1); + if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1)) +#endif + Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1); + } + else { + int j, inc2=2*inc; + int *lk = *ll; + mp_limb_t *tmp; + TMP_DECL(marker); + + TMP_MARK(marker); + tmp = TMP_ALLOC_LIMBS (n+1); + mpn_fft_fft_sqr(Ap, K/2,ll-1,2*omega,n,inc2, tp); + mpn_fft_fft_sqr(Ap+inc, K/2,ll-1,2*omega,n,inc2, tp); + /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc] + A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */ + for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc) { + MPN_COPY(tp, Ap[inc], n+1); + mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp); + mpn_fft_add_modF(Ap[inc], Ap[0], n); + mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp); + mpn_fft_add_modF(Ap[0], tp, n); + } + TMP_FREE(marker); + } +} + + +/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where + N=n*BITS_PER_MP_LIMB + 2^omega is a primitive root mod 2^N+1 + output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */ + +static void +#if __STDC__ +mpn_fft_fft (mp_limb_t **Ap, mp_limb_t **Bp, mp_size_t K, int **ll, + mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp) +#else +mpn_fft_fft(Ap,Bp,K,ll,omega,n,inc,tp) + mp_limb_t **Ap,**Bp,*tp; + mp_size_t K,omega,n,inc; + int **ll; +#endif +{ + if (K==2) { +#ifdef ADDSUB + if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1) +#else + MPN_COPY(tp, Ap[0], n+1); + mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1); + if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1)) +#endif + Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1); +#ifdef ADDSUB + if (mpn_addsub_n(Bp[0], Bp[inc], Bp[0], Bp[inc], n+1) & 1) +#else + MPN_COPY(tp, Bp[0], n+1); + mpn_add_n(Bp[0], Bp[0], Bp[inc],n+1); + if (mpn_sub_n(Bp[inc], tp, Bp[inc],n+1)) +#endif + Bp[inc][n] = mpn_add_1(Bp[inc], Bp[inc], n, 1); + } + else { + int j, inc2=2*inc; + int *lk=*ll; + mp_limb_t *tmp; + TMP_DECL(marker); + + TMP_MARK(marker); + tmp = TMP_ALLOC_LIMBS (n+1); + mpn_fft_fft(Ap, Bp, K/2,ll-1,2*omega,n,inc2, tp); + mpn_fft_fft(Ap+inc, Bp+inc, K/2,ll-1,2*omega,n,inc2, tp); + /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc] + A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */ + for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc,Bp+=2*inc) { + MPN_COPY(tp, Ap[inc], n+1); + mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp); + mpn_fft_add_modF(Ap[inc], Ap[0], n); + mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp); + mpn_fft_add_modF(Ap[0], tp, n); + MPN_COPY(tp, Bp[inc], n+1); + mpn_fft_mul_2exp_modF(Bp[inc], lk[1]*omega, n, tmp); + mpn_fft_add_modF(Bp[inc], Bp[0], n); + mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp); + mpn_fft_add_modF(Bp[0], tp, n); + } + TMP_FREE(marker); + } +} + + +/* a[i] <- a[i]*b[i] mod 2^(n*BITS_PER_MP_LIMB)+1 for 0 <= i < K */ +static void +#if __STDC__ +mpn_fft_mul_modF_K (mp_limb_t **ap, mp_limb_t **bp, mp_size_t n, int K) +#else +mpn_fft_mul_modF_K(ap, bp, n, K) + mp_limb_t **ap, **bp; + mp_size_t n; + int K; +#endif +{ + int i; + int sqr = (ap == 
bp); + TMP_DECL(marker); + + TMP_MARK(marker); + + if (n >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) { + int k, K2,nprime2,Nprime2,M2,maxLK,l,Mp2; + int **_fft_l; + mp_limb_t **Ap,**Bp,*A,*B,*T; + + k = mpn_fft_best_k (n, sqr); + K2 = 1<<k; + maxLK = (K2>BITS_PER_MP_LIMB) ? K2 : BITS_PER_MP_LIMB; + M2 = n*BITS_PER_MP_LIMB/K2; + l = n/K2; + Nprime2 = ((2*M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/ + nprime2 = Nprime2/BITS_PER_MP_LIMB; + Mp2 = Nprime2/K2; + + Ap = TMP_ALLOC_MP_PTRS (K2); + Bp = TMP_ALLOC_MP_PTRS (K2); + A = TMP_ALLOC_LIMBS (2*K2*(nprime2+1)); + T = TMP_ALLOC_LIMBS (nprime2+1); + B = A + K2*(nprime2+1); + _fft_l = TMP_ALLOC_TYPE (k+1, int*); + for (i=0;i<=k;i++) + _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int); + mpn_fft_initl(_fft_l, k); + + TRACE (printf("recurse: %dx%d limbs -> %d times %dx%d (%1.2f)\n", n, + n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2)); + + for (i=0;i<K;i++,ap++,bp++) + mpn_mul_fft_internal(*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2, + l, Mp2, _fft_l, T, 1); + } + else { + mp_limb_t *a, *b, cc, *tp, *tpn; int n2=2*n; + tp = TMP_ALLOC_LIMBS (n2); + tpn = tp+n; + TRACE (printf (" mpn_mul_n %d of %d limbs\n", K, n)); + for (i=0;i<K;i++) { + a = *ap++; b=*bp++; + if (sqr) + mpn_sqr_n(tp, a, n); + else + mpn_mul_n(tp, b, a, n); + if (a[n]) cc=mpn_add_n(tpn, tpn, b, n); else cc=0; + if (b[n]) cc += mpn_add_n(tpn, tpn, a, n) + a[n]; + if (cc) { + cc = mpn_add_1(tp, tp, n2, cc); + ASSERT_NOCARRY (mpn_add_1(tp, tp, n2, cc)); + } + a[n] = mpn_sub_n(a, tp, tpn, n) && mpn_add_1(a, a, n, 1); + } + } + TMP_FREE(marker); +} + + +/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]] + output: K*A[0] K*A[K-1] ... K*A[1] */ + +static void +#if __STDC__ +mpn_fft_fftinv (mp_limb_t **Ap, int K, mp_size_t omega, mp_size_t n, + mp_limb_t *tp) +#else +mpn_fft_fftinv(Ap,K,omega,n,tp) + mp_limb_t **Ap, *tp; + int K; + mp_size_t omega, n; +#endif +{ + if (K==2) { +#ifdef ADDSUB + if (mpn_addsub_n(Ap[0], Ap[1], Ap[0], Ap[1], n+1) & 1) +#else + MPN_COPY(tp, Ap[0], n+1); + mpn_add_n(Ap[0], Ap[0], Ap[1], n+1); + if (mpn_sub_n(Ap[1], tp, Ap[1], n+1)) +#endif + Ap[1][n] = mpn_add_1(Ap[1], Ap[1], n, 1); + } + else { + int j, K2=K/2; mp_limb_t **Bp=Ap+K2, *tmp; + TMP_DECL(marker); + + TMP_MARK(marker); + tmp = TMP_ALLOC_LIMBS (n+1); + mpn_fft_fftinv(Ap, K2, 2*omega, n, tp); + mpn_fft_fftinv(Bp, K2, 2*omega, n, tp); + /* A[j] <- A[j] + omega^j A[j+K/2] + A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */ + for (j=0;j<K2;j++,Ap++,Bp++) { + MPN_COPY(tp, Bp[0], n+1); + mpn_fft_mul_2exp_modF(Bp[0], (j+K2)*omega, n, tmp); + mpn_fft_add_modF(Bp[0], Ap[0], n); + mpn_fft_mul_2exp_modF(tp, j*omega, n, tmp); + mpn_fft_add_modF(Ap[0], tp, n); + } + TMP_FREE(marker); + } +} + + +/* A <- A/2^k mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_div_2exp_modF (mp_limb_t *ap, int k, mp_size_t n, mp_limb_t *tp) +#else +mpn_fft_div_2exp_modF(ap,k,n,tp) + mp_limb_t *ap,*tp; + int k; + mp_size_t n; +#endif +{ + int i; + + i = 2*n*BITS_PER_MP_LIMB; + i = (i-k) % i; + mpn_fft_mul_2exp_modF(ap,i,n,tp); + /* 1/2^k = 2^(2nL-k) mod 2^(n*BITS_PER_MP_LIMB)+1 */ + /* normalize so that A < 2^(n*BITS_PER_MP_LIMB)+1 */ + if (ap[n]==1) { + for (i=0;i<n && ap[i]==0;i++); + if (i<n) { + ap[n]=0; + mpn_sub_1(ap, ap, n, 1); + } + } +} + + +/* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n<=an<=3*n */ +static void +#if __STDC__ +mpn_fft_norm_modF(mp_limb_t *rp, mp_limb_t *ap, mp_size_t n, mp_size_t an) +#else +mpn_fft_norm_modF(rp, ap, n, an) + mp_limb_t *rp; + mp_limb_t *ap; + mp_size_t n; 
+ mp_size_t an; +#endif +{ + mp_size_t l; + + if (an>2*n) { + l = n; + rp[n] = mpn_add_1(rp+an-2*n, ap+an-2*n, 3*n-an, + mpn_add_n(rp,ap,ap+2*n,an-2*n)); + } + else { + l = an-n; + MPN_COPY(rp, ap, n); + rp[n]=0; + } + if (mpn_sub_n(rp,rp,ap+n,l)) { + if (mpn_sub_1(rp+l,rp+l,n+1-l,1)) + rp[n]=mpn_add_1(rp,rp,n,1); + } +} + + +static void +#if __STDC__ +mpn_mul_fft_internal(mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl, + int k, int K, + mp_limb_t **Ap, mp_limb_t **Bp, + mp_limb_t *A, mp_limb_t *B, + mp_size_t nprime, mp_size_t l, mp_size_t Mp, + int **_fft_l, + mp_limb_t *T, int rec) +#else +mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,rec) + mp_limb_t *op; + mp_srcptr n, m; + mp_limb_t **Ap,**Bp,*A,*B,*T; + mp_size_t pl,nprime; + int **_fft_l; + int k,K,l,Mp,rec; +#endif +{ + int i, sqr, pla, lo, sh, j; + mp_limb_t *p; + + sqr = (n==m); + + TRACE (printf ("pl=%d k=%d K=%d np=%d l=%d Mp=%d rec=%d sqr=%d\n", + pl,k,K,nprime,l,Mp,rec,sqr)); + + /* decomposition of inputs into arrays Ap[i] and Bp[i] */ + if (rec) for (i=0;i<K;i++) { + Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1); + /* store the next M bits of n into A[i] */ + /* supposes that M is a multiple of BITS_PER_MP_LIMB */ + MPN_COPY(Ap[i], n, l); n+=l; MPN_ZERO(Ap[i]+l, nprime+1-l); + /* set most significant bits of n and m (important in recursive calls) */ + if (i==K-1) Ap[i][l]=n[0]; + mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T); + if (!sqr) { + MPN_COPY(Bp[i], m, l); m+=l; MPN_ZERO(Bp[i]+l, nprime+1-l); + if (i==K-1) Bp[i][l]=m[0]; + mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T); + } + } + + /* direct fft's */ + if (sqr) mpn_fft_fft_sqr(Ap,K,_fft_l+k,2*Mp,nprime,1, T); + else mpn_fft_fft(Ap,Bp,K,_fft_l+k,2*Mp,nprime,1, T); + + /* term to term multiplications */ + mpn_fft_mul_modF_K(Ap, (sqr) ? Ap : Bp, nprime, K); + + /* inverse fft's */ + mpn_fft_fftinv(Ap, K, 2*Mp, nprime, T); + + /* division of terms after inverse fft */ + for (i=0;i<K;i++) mpn_fft_div_2exp_modF(Ap[i],k+((K-i)%K)*Mp,nprime, T); + + /* addition of terms in result p */ + MPN_ZERO(T,nprime+1); + pla = l*(K-1)+nprime+1; /* number of required limbs for p */ + p = B; /* B has K*(n'+1) limbs, which is >= pla, i.e. enough */ + MPN_ZERO(p, pla); + sqr=0; /* will accumulate the (signed) carry at p[pla] */ + for (i=K-1,lo=l*i+nprime,sh=l*i;i>=0;i--,lo-=l,sh-=l) { + mp_ptr n = p+sh; + j = (K-i)%K; + if (mpn_add_n(n,n,Ap[j],nprime+1)) + sqr += mpn_add_1(n+nprime+1,n+nprime+1,pla-sh-nprime-1,1); + T[2*l]=i+1; /* T = (i+1)*2^(2*M) */ + if (mpn_cmp(Ap[j],T,nprime+1)>0) { /* subtract 2^N'+1 */ + sqr -= mpn_sub_1(n,n,pla-sh,1); + sqr -= mpn_sub_1(p+lo,p+lo,pla-lo,1); + } + } + if (sqr==-1) { + if ((sqr=mpn_add_1(p+pla-pl,p+pla-pl,pl,1))) { + /* p[pla-pl]...p[pla-1] are all zero */ + mpn_sub_1(p+pla-pl-1,p+pla-pl-1,pl+1,1); + mpn_sub_1(p+pla-1,p+pla-1,1,1); + } + } + else if (sqr==1) { + if (pla>=2*pl) + while ((sqr=mpn_add_1(p+pla-2*pl,p+pla-2*pl,2*pl,sqr))); + else { + sqr = mpn_sub_1(p+pla-pl,p+pla-pl,pl,sqr); + ASSERT (sqr == 0); + } + } + else + ASSERT (sqr == 0); + + /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ] + < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ] + < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */ + mpn_fft_norm_modF(op,p,pl,pla); +} + + +/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB + n and m have respectively nl and ml limbs + op must have space for pl+1 limbs + One must have pl = mpn_fft_next_size(pl, k). 
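+
+   Illustrative usage sketch (not part of the original comment); for a
+   plain product of {n,nl} and {m,ml} one would pick
+
+       k  = mpn_fft_best_k (nl + ml, 0);
+       pl = mpn_fft_next_size (nl + ml, k);
+       mpn_mul_fft (op, pl, n, nl, m, ml, k);
+
+   where 0 selects the multiply (not square) tuning table; since
+   nl + ml <= pl, the reduction mod 2^(pl*BITS_PER_MP_LIMB)+1 leaves
+   the exact product in the low nl + ml limbs of op.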
+*/ + +void +#if __STDC__ +mpn_mul_fft (mp_ptr op, mp_size_t pl, + mp_srcptr n, mp_size_t nl, + mp_srcptr m, mp_size_t ml, + int k) +#else +mpn_mul_fft (op, pl, n, nl, m, ml, k) + mp_ptr op; + mp_size_t pl; + mp_srcptr n; + mp_size_t nl; + mp_srcptr m; + mp_size_t ml; + int k; +#endif +{ + int K,maxLK,i,j; + mp_size_t N,Nprime,nprime,M,Mp,l; + mp_limb_t **Ap,**Bp,*A,*T,*B; + int **_fft_l; + int sqr = (n==m && nl==ml); + TMP_DECL(marker); + + TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n", + pl, nl, ml, k)); + ASSERT_ALWAYS (mpn_fft_next_size(pl, k) == pl); + + TMP_MARK(marker); + N = pl*BITS_PER_MP_LIMB; + _fft_l = TMP_ALLOC_TYPE (k+1, int*); + for (i=0;i<=k;i++) + _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int); + mpn_fft_initl(_fft_l, k); + K = 1<<k; + M = N/K; /* N = 2^k M */ + l = M/BITS_PER_MP_LIMB; + maxLK = (K>BITS_PER_MP_LIMB) ? K : BITS_PER_MP_LIMB; + + Nprime = ((2*M+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */ + nprime = Nprime/BITS_PER_MP_LIMB; + TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n", + N, K, M, l, maxLK, Nprime, nprime)); + if (nprime >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) { + maxLK = (1<<mpn_fft_best_k(nprime,n==m))*BITS_PER_MP_LIMB; + if (Nprime % maxLK) { + Nprime=((Nprime/maxLK)+1)*maxLK; + nprime = Nprime/BITS_PER_MP_LIMB; + } + TRACE (printf ("new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime)); + } + + T = TMP_ALLOC_LIMBS (nprime+1); + Mp = Nprime/K; + + TRACE (printf("%dx%d limbs -> %d times %dx%d limbs (%1.2f)\n", + pl,pl,K,nprime,nprime,2.0*(double)N/Nprime/K); + printf(" temp space %ld\n", 2*K*(nprime+1))); + + A = _MP_ALLOCATE_FUNC_LIMBS (2*K*(nprime+1)); + B = A+K*(nprime+1); + Ap = TMP_ALLOC_MP_PTRS (K); + Bp = TMP_ALLOC_MP_PTRS (K); + /* special decomposition for main call */ + for (i=0;i<K;i++) { + Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1); + /* store the next M bits of n into A[i] */ + /* supposes that M is a multiple of BITS_PER_MP_LIMB */ + if (nl>0) { + j = (nl>=l) ? l : nl; /* limbs to store in Ap[i] */ + MPN_COPY(Ap[i], n, j); n+=l; MPN_ZERO(Ap[i]+j, nprime+1-j); + mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T); + } + else MPN_ZERO(Ap[i], nprime+1); + nl -= l; + if (n!=m) { + if (ml>0) { + j = (ml>=l) ? l : ml; /* limbs to store in Bp[i] */ + MPN_COPY(Bp[i], m, j); m+=l; MPN_ZERO(Bp[i]+j, nprime+1-j); + mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T); + } + else MPN_ZERO(Bp[i], nprime+1); + } + ml -= l; + } + mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,0); + TMP_FREE(marker); + _MP_FREE_FUNC_LIMBS (A, 2*K*(nprime+1)); +} + + +#if WANT_ASSERT +static int +#if __STDC__ +mpn_zero_p (mp_ptr p, mp_size_t n) +#else + mpn_zero_p (p, n) + mp_ptr p; + mp_size_t n; +#endif +{ + mp_size_t i; + + for (i = 0; i < n; i++) + { + if (p[i] != 0) + return 0; + } + + return 1; +} +#endif + + +/* Multiply {n,nl}*{m,ml} and write the result to {op,nl+ml}. + + FIXME: Duplicating the result like this is wasteful, do something better + perhaps at the norm_modF stage above. 
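+   (Editorial note: since pl >= nl+ml, the full product is smaller than +   2^(pl*BITS_PER_MP_LIMB)+1, so reducing modulo that number cannot wrap and +   the low nl+ml limbs are the exact product; the limbs above them must be +   zero, which the ASSERT below checks.)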
*/ + +void +#if __STDC__ +mpn_mul_fft_full (mp_ptr op, + mp_srcptr n, mp_size_t nl, + mp_srcptr m, mp_size_t ml) +#else +mpn_mul_fft_full (op, n, nl, m, ml) + mp_ptr op; + mp_srcptr n; + mp_size_t nl; + mp_srcptr m; + mp_size_t ml; +#endif +{ + mp_ptr pad_op; + mp_size_t pl; + int k; + int sqr = (n==m && nl==ml); + + k = mpn_fft_best_k (nl+ml, sqr); + pl = mpn_fft_next_size (nl+ml, k); + + TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl=%ld k=%d\n", + nl, ml, pl, k)); + + pad_op = _MP_ALLOCATE_FUNC_LIMBS (pl+1); + mpn_mul_fft (pad_op, pl, n, nl, m, ml, k); + + ASSERT (mpn_zero_p (pad_op+nl+ml, pl+1-(nl+ml))); + MPN_COPY (op, pad_op, nl+ml); + + _MP_FREE_FUNC_LIMBS (pad_op, pl+1); +} diff --git a/rts/gmp/mpn/generic/mul_n.c b/rts/gmp/mpn/generic/mul_n.c new file mode 100644 index 0000000000..b7563be2d3 --- /dev/null +++ b/rts/gmp/mpn/generic/mul_n.c @@ -0,0 +1,1343 @@ +/* mpn_mul_n and helper functions -- Multiply/square natural numbers. + + THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n) + ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH + THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB. + 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */ +#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1) + +#if !defined (__alpha) && !defined (__mips) +/* For all other machines, we want to call mpn functions for the compound + operations instead of open-coding them. */ +#define USE_MORE_MPN +#endif + +/*== Function declarations =================================================*/ + +static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr, + mp_ptr, mp_ptr, mp_ptr, + mp_srcptr, mp_srcptr, mp_srcptr, + mp_size_t, mp_size_t)); +static void interpolate3 _PROTO ((mp_srcptr, + mp_ptr, mp_ptr, mp_ptr, + mp_srcptr, + mp_ptr, mp_ptr, mp_ptr, + mp_size_t, mp_size_t)); +static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)); + + +/*-- mpn_kara_mul_n ---------------------------------------------------------------*/ + +/* Multiplies using 3 half-sized mults and so on recursively. + * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1]. + * No overlap of p[...] with a[...] or b[...]. + * ws is workspace.
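+ * (Expository sketch, not from the original source: write a = a1*W + a0 and + * b = b1*W + b0 with W = 2^(ceil(n/2)*BITS_PER_MP_LIMB). The code uses + * a*b = (W^2+W)*a1*b1 + W*(a1-a0)*(b0-b1) + (W+1)*a0*b0, + * so one multiply of half-sized differences replaces the fourth half-sized + * product; the branches below compute |a1-a0| and |b1-b0| and track the + * sign separately.)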
+ */ + +void +#if __STDC__ +mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) +#else +mpn_kara_mul_n(p, a, b, n, ws) + mp_ptr p; + mp_srcptr a; + mp_srcptr b; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t i, sign, w, w0, w1; + mp_size_t n2; + mp_srcptr x, y; + + n2 = n >> 1; + ASSERT (n2 > 0); + + if (n & 1) + { + /* Odd length. */ + mp_size_t n1, n3, nm1; + + n3 = n - n2; + + sign = 0; + w = a[n2]; + if (w != 0) + w -= mpn_sub_n (p, a, a + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n3; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n3; + } + mpn_sub_n (p, x, y, n2); + } + p[n2] = w; + + w = b[n2]; + if (w != 0) + w -= mpn_sub_n (p + n3, b, b + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = b[i]; + w1 = b[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = b + n3; + y = b; + sign ^= 1; + } + else + { + x = b; + y = b + n3; + } + mpn_sub_n (p + n3, x, y, n2); + } + p[n] = w; + + n1 = n + 1; + if (n2 < KARATSUBA_MUL_THRESHOLD) + { + if (n3 < KARATSUBA_MUL_THRESHOLD) + { + mpn_mul_basecase (ws, p, n3, p + n3, n3); + mpn_mul_basecase (p, a, n3, b, n3); + } + else + { + mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1); + mpn_kara_mul_n (p, a, b, n3, ws + n1); + } + mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2); + } + else + { + mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1); + mpn_kara_mul_n (p, a, b, n3, ws + n1); + mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1); + } + + if (sign) + mpn_add_n (ws, p, ws, n1); + else + mpn_sub_n (ws, p, ws, n1); + + nm1 = n - 1; + if (mpn_add_n (ws, p + n1, ws, nm1)) + { + mp_limb_t x = ws[nm1] + 1; + ws[nm1] = x; + if (x == 0) + ++ws[n]; + } + if (mpn_add_n (p + n3, p + n3, ws, n1)) + { + mp_limb_t x; + i = n1 + n3; + do + { + x = p[i] + 1; + p[i] = x; + ++i; + } while (x == 0); + } + } + else + { + /* Even length. */ + mp_limb_t t; + + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n2+i]; + } + while (w0 == w1 && i != 0); + sign = 0; + if (w0 < w1) + { + x = a + n2; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n2; + } + mpn_sub_n (p, x, y, n2); + + i = n2; + do + { + --i; + w0 = b[i]; + w1 = b[n2+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = b + n2; + y = b; + sign ^= 1; + } + else + { + x = b; + y = b + n2; + } + mpn_sub_n (p + n2, x, y, n2); + + /* Pointwise products. */ + if (n2 < KARATSUBA_MUL_THRESHOLD) + { + mpn_mul_basecase (ws, p, n2, p + n2, n2); + mpn_mul_basecase (p, a, n2, b, n2); + mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2); + } + else + { + mpn_kara_mul_n (ws, p, p + n2, n2, ws + n); + mpn_kara_mul_n (p, a, b, n2, ws + n); + mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n); + } + + /* Interpolate. */ + if (sign) + w = mpn_add_n (ws, p, ws, n); + else + w = -mpn_sub_n (ws, p, ws, n); + w += mpn_add_n (ws, p + n, ws, n); + w += mpn_add_n (p + n2, p + n2, ws, n); + /* TO DO: could put "if (w) { ... }" here. + * Less work but badly predicted branch. + * No measurable difference in speed on Alpha. + */ + i = n + n2; + t = p[i] + w; + p[i] = t; + if (t < w) + { + do + { + ++i; + w = p[i] + 1; + p[i] = w; + } + while (w == 0); + } + } +} + +void +#if __STDC__ +mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) +#else +mpn_kara_sqr_n (p, a, n, ws) + mp_ptr p; + mp_srcptr a; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t i, sign, w, w0, w1; + mp_size_t n2; + mp_srcptr x, y; + + n2 = n >> 1; + ASSERT (n2 > 0); + + if (n & 1) + { + /* Odd length. 
*/ + mp_size_t n1, n3, nm1; + + n3 = n - n2; + + sign = 0; + w = a[n2]; + if (w != 0) + w -= mpn_sub_n (p, a, a + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n3; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n3; + } + mpn_sub_n (p, x, y, n2); + } + p[n2] = w; + + w = a[n2]; + if (w != 0) + w -= mpn_sub_n (p + n3, a, a + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n3; + y = a; + sign ^= 1; + } + else + { + x = a; + y = a + n3; + } + mpn_sub_n (p + n3, x, y, n2); + } + p[n] = w; + + n1 = n + 1; + if (n2 < KARATSUBA_SQR_THRESHOLD) + { + if (n3 < KARATSUBA_SQR_THRESHOLD) + { + mpn_sqr_basecase (ws, p, n3); + mpn_sqr_basecase (p, a, n3); + } + else + { + mpn_kara_sqr_n (ws, p, n3, ws + n1); + mpn_kara_sqr_n (p, a, n3, ws + n1); + } + mpn_sqr_basecase (p + n1, a + n3, n2); + } + else + { + mpn_kara_sqr_n (ws, p, n3, ws + n1); + mpn_kara_sqr_n (p, a, n3, ws + n1); + mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); + } + + if (sign) + mpn_add_n (ws, p, ws, n1); + else + mpn_sub_n (ws, p, ws, n1); + + nm1 = n - 1; + if (mpn_add_n (ws, p + n1, ws, nm1)) + { + mp_limb_t x = ws[nm1] + 1; + ws[nm1] = x; + if (x == 0) + ++ws[n]; + } + if (mpn_add_n (p + n3, p + n3, ws, n1)) + { + mp_limb_t x; + i = n1 + n3; + do + { + x = p[i] + 1; + p[i] = x; + ++i; + } while (x == 0); + } + } + else + { + /* Even length. */ + mp_limb_t t; + + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n2+i]; + } + while (w0 == w1 && i != 0); + sign = 0; + if (w0 < w1) + { + x = a + n2; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n2; + } + mpn_sub_n (p, x, y, n2); + + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n2+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n2; + y = a; + sign ^= 1; + } + else + { + x = a; + y = a + n2; + } + mpn_sub_n (p + n2, x, y, n2); + + /* Pointwise products. */ + if (n2 < KARATSUBA_SQR_THRESHOLD) + { + mpn_sqr_basecase (ws, p, n2); + mpn_sqr_basecase (p, a, n2); + mpn_sqr_basecase (p + n, a + n2, n2); + } + else + { + mpn_kara_sqr_n (ws, p, n2, ws + n); + mpn_kara_sqr_n (p, a, n2, ws + n); + mpn_kara_sqr_n (p + n, a + n2, n2, ws + n); + } + + /* Interpolate. */ + if (sign) + w = mpn_add_n (ws, p, ws, n); + else + w = -mpn_sub_n (ws, p, ws, n); + w += mpn_add_n (ws, p + n, ws, n); + w += mpn_add_n (p + n2, p + n2, ws, n); + /* TO DO: could put "if (w) { ... }" here. + * Less work but badly predicted branch. + * No measurable difference in speed on Alpha. + */ + i = n + n2; + t = p[i] + w; + p[i] = t; + if (t < w) + { + do + { + ++i; + w = p[i] + 1; + p[i] = w; + } + while (w == 0); + } + } +} + +/*-- add2Times -------------------------------------------------------------*/ + +/* z[] = x[] + 2 * y[] + Note that z and x might point to the same vectors. 
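+   (Added note: the return value is the carry out of the top limb, in the +   range 0..2; e.g. with 4-bit limbs, x[] = {0xF} and y[] = {0xF} give +   z[] = {0xD} and carry 2.)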
*/ +#ifdef USE_MORE_MPN +static inline mp_limb_t +#if __STDC__ +add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) +#else +add2Times (z, x, y, n) + mp_ptr z; + mp_srcptr x; + mp_srcptr y; + mp_size_t n; +#endif +{ + mp_ptr t; + mp_limb_t c; + TMP_DECL (marker); + TMP_MARK (marker); + t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + c = mpn_lshift (t, y, n, 1); + c += mpn_add_n (z, x, t, n); + TMP_FREE (marker); + return c; +} +#else + +static mp_limb_t +#if __STDC__ +add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) +#else +add2Times (z, x, y, n) + mp_ptr z; + mp_srcptr x; + mp_srcptr y; + mp_size_t n; +#endif +{ + mp_limb_t c, v, w; + + ASSERT (n > 0); + v = *x; w = *y; + c = w >> (BITS_PER_MP_LIMB - 1); + w <<= 1; + v += w; + c += v < w; + *z = v; + ++x; ++y; ++z; + while (--n) + { + v = *x; + w = *y; + v += c; + c = v < c; + c += w >> (BITS_PER_MP_LIMB - 1); + w <<= 1; + v += w; + c += v < w; + *z = v; + ++x; ++y; ++z; + } + + return c; +} +#endif + +/*-- evaluate3 -------------------------------------------------------------*/ + +/* Evaluates: + * ph := 4*A+2*B+C + * p1 := A+B+C + * p2 := A+2*B+4*C + * where: + * ph[], p1[], p2[], A[] and B[] all have length len, + * C[] has length len2 with len-len2 = 0, 1 or 2. + * Returns top words (overflow) at pth, pt1 and pt2 respectively. + */ +#ifdef USE_MORE_MPN +static void +#if __STDC__ +evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, + mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2) +#else +evaluate3 (ph, p1, p2, pth, pt1, pt2, + A, B, C, len, len2) + mp_ptr ph; + mp_ptr p1; + mp_ptr p2; + mp_ptr pth; + mp_ptr pt1; + mp_ptr pt2; + mp_srcptr A; + mp_srcptr B; + mp_srcptr C; + mp_size_t len; + mp_size_t len2; +#endif +{ + mp_limb_t c, d, e; + + ASSERT (len - len2 <= 2); + + e = mpn_lshift (p1, B, len, 1); + + c = mpn_lshift (ph, A, len, 2); + c += e + mpn_add_n (ph, ph, p1, len); + d = mpn_add_n (ph, ph, C, len2); + if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d); + ASSERT (c < 7); + *pth = c; + + c = mpn_lshift (p2, C, len2, 2); +#if 1 + if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; } + c += e + mpn_add_n (p2, p2, p1, len); +#else + d = mpn_add_n (p2, p2, p1, len2); + c += d; + if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c); + c += e; +#endif + c += mpn_add_n (p2, p2, A, len); + ASSERT (c < 7); + *pt2 = c; + + c = mpn_add_n (p1, A, B, len); + d = mpn_add_n (p1, p1, C, len2); + if (len2 == len) c += d; + else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d); + ASSERT (c < 3); + *pt1 = c; + +} + +#else + +static void +#if __STDC__ +evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, + mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls) +#else +evaluate3 (ph, p1, p2, pth, pt1, pt2, + A, B, C, l, ls) + mp_ptr ph; + mp_ptr p1; + mp_ptr p2; + mp_ptr pth; + mp_ptr pt1; + mp_ptr pt2; + mp_srcptr A; + mp_srcptr B; + mp_srcptr C; + mp_size_t l; + mp_size_t ls; +#endif +{ + mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2; + + ASSERT (l - ls <= 2); + + th = t1 = t2 = 0; + for (i = 0; i < l; ++i) + { + a = *A; + b = *B; + c = i < ls ? *C : 0; + + /* TO DO: choose one of the following alternatives. 
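+         (Editorial note: both alternatives accumulate vh = 4*a + 2*b + c on +         top of the incoming carry th; they differ only in the order of the +         three partial additions.)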
*/ +#if 0 + t = a << 2; + vh = th + t; + th = vh < t; + th += a >> (BITS_PER_MP_LIMB - 2); + t = b << 1; + vh += t; + th += vh < t; + th += b >> (BITS_PER_MP_LIMB - 1); + vh += c; + th += vh < c; +#else + vh = th + c; + th = vh < c; + t = b << 1; + vh += t; + th += vh < t; + th += b >> (BITS_PER_MP_LIMB - 1); + t = a << 2; + vh += t; + th += vh < t; + th += a >> (BITS_PER_MP_LIMB - 2); +#endif + + v1 = t1 + a; + t1 = v1 < a; + v1 += b; + t1 += v1 < b; + v1 += c; + t1 += v1 < c; + + v2 = t2 + a; + t2 = v2 < a; + t = b << 1; + v2 += t; + t2 += v2 < t; + t2 += b >> (BITS_PER_MP_LIMB - 1); + t = c << 2; + v2 += t; + t2 += v2 < t; + t2 += c >> (BITS_PER_MP_LIMB - 2); + + *ph = vh; + *p1 = v1; + *p2 = v2; + + ++A; ++B; ++C; + ++ph; ++p1; ++p2; + } + + ASSERT (th < 7); + ASSERT (t1 < 3); + ASSERT (t2 < 7); + + *pth = th; + *pt1 = t1; + *pt2 = t2; +} +#endif + + +/*-- interpolate3 ----------------------------------------------------------*/ + +/* Interpolates B, C, D (in-place) from: + * 16*A+8*B+4*C+2*D+E + * A+B+C+D+E + * A+2*B+4*C+8*D+16*E + * where: + * A[], B[], C[] and D[] all have length l, + * E[] has length ls with l-ls = 0, 2 or 4. + * + * Reads top words (from earlier overflow) from ptb, ptc and ptd, + * and returns new top words there. + */ + +#ifdef USE_MORE_MPN +static void +#if __STDC__ +interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, + mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2) +#else +interpolate3 (A, B, C, D, E, + ptb, ptc, ptd, len, len2) + mp_srcptr A; + mp_ptr B; + mp_ptr C; + mp_ptr D; + mp_srcptr E; + mp_ptr ptb; + mp_ptr ptc; + mp_ptr ptd; + mp_size_t len; + mp_size_t len2; +#endif +{ + mp_ptr ws; + mp_limb_t t, tb,tc,td; + TMP_DECL (marker); + TMP_MARK (marker); + + ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4); + + /* Let x1, x2, x3 be the values to interpolate. We have: + * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e + * c = a + x1 + x2 + x3 + e + * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e + */ + + ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB); + + tb = *ptb; tc = *ptc; td = *ptd; + + + /* b := b - 16*a - e + * c := c - a - e + * d := d - a - 16*e + */ + + t = mpn_lshift (ws, A, len, 4); + tb -= t + mpn_sub_n (B, B, ws, len); + t = mpn_sub_n (B, B, E, len2); + if (len2 == len) tb -= t; + else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t); + + tc -= mpn_sub_n (C, C, A, len); + t = mpn_sub_n (C, C, E, len2); + if (len2 == len) tc -= t; + else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t); + + t = mpn_lshift (ws, E, len2, 4); + t += mpn_add_n (ws, ws, A, len2); +#if 1 + if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t); + td -= t + mpn_sub_n (D, D, ws, len); +#else + t += mpn_sub_n (D, D, ws, len2); + if (len2 != len) { + t = mpn_sub_1 (D+len2, D+len2, len-len2, t); + t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2); + } /* end if/else */ + td -= t; +#endif + + + /* b, d := b + d, b - d */ + +#ifdef HAVE_MPN_ADD_SUB_N + /* #error TO DO ... */ +#else + t = tb + td + mpn_add_n (ws, B, D, len); + td = tb - td - mpn_sub_n (D, B, D, len); + tb = t; + MPN_COPY (B, ws, len); +#endif + + /* b := b-8*c */ + t = 8 * tc + mpn_lshift (ws, C, len, 3); + tb -= t + mpn_sub_n (B, B, ws, len); + + /* c := 2*c - b */ + tc = 2 * tc + mpn_lshift (C, C, len, 1); + tc -= tb + mpn_sub_n (C, C, B, len); + + /* d := d/3 */ + td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3; + + /* b, d := b + d, b - d */ +#ifdef HAVE_MPN_ADD_SUB_N + /* #error TO DO ... 
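+     (presumably: compute b+d and b-d in one pass with mpn_addsub_n instead +     of the separate mpn_add_n/mpn_sub_n calls in the #else branch; this +     reading of the TO DO is an editorial guess.)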
*/ +#else + t = tb + td + mpn_add_n (ws, B, D, len); + td = tb - td - mpn_sub_n (D, B, D, len); + tb = t; + MPN_COPY (B, ws, len); +#endif + + /* Now: + * b = 4*x1 + * c = 2*x2 + * d = 4*x3 + */ + + ASSERT(!(*B & 3)); + mpn_rshift (B, B, len, 2); + B[len-1] |= tb<<(BITS_PER_MP_LIMB-2); + ASSERT((long)tb >= 0); + tb >>= 2; + + ASSERT(!(*C & 1)); + mpn_rshift (C, C, len, 1); + C[len-1] |= tc<<(BITS_PER_MP_LIMB-1); + ASSERT((long)tc >= 0); + tc >>= 1; + + ASSERT(!(*D & 3)); + mpn_rshift (D, D, len, 2); + D[len-1] |= td<<(BITS_PER_MP_LIMB-2); + ASSERT((long)td >= 0); + td >>= 2; + +#if WANT_ASSERT + ASSERT (tb < 2); + if (len == len2) + { + ASSERT (tc < 3); + ASSERT (td < 2); + } + else + { + ASSERT (tc < 2); + ASSERT (!td); + } +#endif + + *ptb = tb; + *ptc = tc; + *ptd = td; + + TMP_FREE (marker); +} + +#else + +static void +#if __STDC__ +interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, + mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls) +#else +interpolate3 (A, B, C, D, E, + ptb, ptc, ptd, l, ls) + mp_srcptr A; + mp_ptr B; + mp_ptr C; + mp_ptr D; + mp_srcptr E; + mp_ptr ptb; + mp_ptr ptc; + mp_ptr ptd; + mp_size_t l; + mp_size_t ls; +#endif +{ + mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od; + const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1); + +#if WANT_ASSERT + t = l - ls; + ASSERT (t == 0 || t == 2 || t == 4); +#endif + + sb = sc = sd = 0; + for (i = 0; i < l; ++i) + { + mp_limb_t tb, tc, td, tt; + + a = *A; + b = *B; + c = *C; + d = *D; + e = i < ls ? *E : 0; + + /* Let x1, x2, x3 be the values to interpolate. We have: + * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e + * c = a + x1 + x2 + x3 + e + * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e + */ + + /* b := b - 16*a - e + * c := c - a - e + * d := d - a - 16*e + */ + t = a << 4; + tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t); + b -= t; + tb -= b < e; + b -= e; + tc = -(c < a); + c -= a; + tc -= c < e; + c -= e; + td = -(d < a); + d -= a; + t = e << 4; + td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t); + d -= t; + + /* b, d := b + d, b - d */ + t = b + d; + tt = tb + td + (t < b); + td = tb - td - (b < d); + d = b - d; + b = t; + tb = tt; + + /* b := b-8*c */ + t = c << 3; + tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t); + b -= t; + + /* c := 2*c - b */ + t = c << 1; + tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b); + c = t - b; + + /* d := d/3 */ + d *= INVERSE_3; + td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d); + td *= INVERSE_3; + + /* b, d := b + d, b - d */ + t = b + d; + tt = tb + td + (t < b); + td = tb - td - (b < d); + d = b - d; + b = t; + tb = tt; + + /* Now: + * b = 4*x1 + * c = 2*x2 + * d = 4*x3 + */ + + /* sb has period 2. */ + b += sb; + tb += b < sb; + sb &= maskOffHalf; + sb |= sb >> (BITS_PER_MP_LIMB >> 1); + sb += tb; + + /* sc has period 1. */ + c += sc; + tc += c < sc; + /* TO DO: choose one of the following alternatives. */ +#if 1 + sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1)); + sc += tc; +#else + sc = tc - ((long)sc < 0L); +#endif + + /* sd has period 2. */ + d += sd; + td += d < sd; + sd &= maskOffHalf; + sd |= sd >> (BITS_PER_MP_LIMB >> 1); + sd += td; + + if (i != 0) + { + B[-1] = ob | b << (BITS_PER_MP_LIMB - 2); + C[-1] = oc | c << (BITS_PER_MP_LIMB - 1); + D[-1] = od | d << (BITS_PER_MP_LIMB - 2); + } + ob = b >> 2; + oc = c >> 1; + od = d >> 2; + + ++A; ++B; ++C; ++D; ++E; + } + + /* Handle top words. 
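+     (Added note: the code below applies the same linear transformation once +     more to the incoming top words *ptb, *ptc and *ptd, then folds in the +     running column carries sb, sc and sd from the main loop.)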
*/ + b = *ptb; + c = *ptc; + d = *ptd; + + t = b + d; + d = b - d; + b = t; + b -= c << 3; + c = (c << 1) - b; + d *= INVERSE_3; + t = b + d; + d = b - d; + b = t; + + b += sb; + c += sc; + d += sd; + + B[-1] = ob | b << (BITS_PER_MP_LIMB - 2); + C[-1] = oc | c << (BITS_PER_MP_LIMB - 1); + D[-1] = od | d << (BITS_PER_MP_LIMB - 2); + + b >>= 2; + c >>= 1; + d >>= 2; + +#if WANT_ASSERT + ASSERT (b < 2); + if (l == ls) + { + ASSERT (c < 3); + ASSERT (d < 2); + } + else + { + ASSERT (c < 2); + ASSERT (!d); + } +#endif + + *ptb = b; + *ptc = c; + *ptd = d; +} +#endif + + +/*-- mpn_toom3_mul_n --------------------------------------------------------------*/ + +/* Multiplies using 5 mults of one third size and so on recursively. + * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1]. + * No overlap of p[...] with a[...] or b[...]. + * ws is workspace. + */ + +/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the + * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n() + * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false. + */ + +#define TOOM3_MUL_REC(p, a, b, n, ws) \ + do { \ + if (n < KARATSUBA_MUL_THRESHOLD) \ + mpn_mul_basecase (p, a, n, b, n); \ + else if (n < TOOM3_MUL_THRESHOLD) \ + mpn_kara_mul_n (p, a, b, n, ws); \ + else \ + mpn_toom3_mul_n (p, a, b, n, ws); \ + } while (0) + +void +#if __STDC__ +mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) +#else +mpn_toom3_mul_n (p, a, b, n, ws) + mp_ptr p; + mp_srcptr a; + mp_srcptr b; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD; + mp_limb_t *A,*B,*C,*D,*E, *W; + mp_size_t l,l2,l3,l4,l5,ls; + + /* Break n words into chunks of size l, l and ls. + * n = 3*k => l = k, ls = k + * n = 3*k+1 => l = k+1, ls = k-1 + * n = 3*k+2 => l = k+1, ls = k + */ + { + mp_limb_t m; + + ASSERT (n >= TOOM3_MUL_THRESHOLD); + l = ls = n / 3; + m = n - l * 3; + if (m != 0) + ++l; + if (m == 1) + --ls; + + l2 = l * 2; + l3 = l * 3; + l4 = l * 4; + l5 = l * 5; + A = p; + B = ws; + C = p + l2; + D = ws + l2; + E = p + l4; + W = ws + l4; + } + + /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ + evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); + evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls); + + /** Second stage: pointwise multiplies. **/ + TOOM3_MUL_REC(D, C, C + l, l, W); + tD = cD*dD; + if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD); + if (dD) tD += mpn_addmul_1 (D + l, C, l, dD); + ASSERT (tD < 49); + TOOM3_MUL_REC(C, B, B + l, l, W); + tC = cC*dC; + /* TO DO: choose one of the following alternatives. */ +#if 0 + if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC); + if (dC) tC += mpn_addmul_1 (C + l, B, l, dC); +#else + if (cC) + { + if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l); + else tC += add2Times (C + l, C + l, B + l, l); + } + if (dC) + { + if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l); + else tC += add2Times (C + l, C + l, B, l); + } +#endif + ASSERT (tC < 9); + TOOM3_MUL_REC(B, A, A + l, l, W); + tB = cB*dB; + if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB); + if (dB) tB += mpn_addmul_1 (B + l, A, l, dB); + ASSERT (tB < 49); + TOOM3_MUL_REC(A, a, b, l, W); + TOOM3_MUL_REC(E, a + l2, b + l2, ls, W); + + /** Third stage: interpolation. **/ + interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); + + /** Final stage: add up the coefficients. 
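+      (Spelled out for exposition: with x = 2^(l*BITS_PER_MP_LIMB) the +      product is A + B*x + C*x^2 + D*x^3 + E*x^4; A, C and E already sit in +      p at offsets 0, 2l and 4l, so only B and D remain to be added in at +      offsets l and 3l, together with the top words tB, tC and tD.)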
**/ + { + mp_limb_t i, x, y; + tB += mpn_add_n (p + l, p + l, B, l2); + tD += mpn_add_n (p + l3, p + l3, D, l2); + mpn_incr_u (p + l3, tB); + mpn_incr_u (p + l4, tC); + mpn_incr_u (p + l5, tD); + } +} + +/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/ + +/* Like previous function but for squaring */ + +#define TOOM3_SQR_REC(p, a, n, ws) \ + do { \ + if (n < KARATSUBA_SQR_THRESHOLD) \ + mpn_sqr_basecase (p, a, n); \ + else if (n < TOOM3_SQR_THRESHOLD) \ + mpn_kara_sqr_n (p, a, n, ws); \ + else \ + mpn_toom3_sqr_n (p, a, n, ws); \ + } while (0) + +void +#if __STDC__ +mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) +#else +mpn_toom3_sqr_n (p, a, n, ws) + mp_ptr p; + mp_srcptr a; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t cB,cC,cD, tB,tC,tD; + mp_limb_t *A,*B,*C,*D,*E, *W; + mp_size_t l,l2,l3,l4,l5,ls; + + /* Break n words into chunks of size l, l and ls. + * n = 3*k => l = k, ls = k + * n = 3*k+1 => l = k+1, ls = k-1 + * n = 3*k+2 => l = k+1, ls = k + */ + { + mp_limb_t m; + + ASSERT (n >= TOOM3_MUL_THRESHOLD); + l = ls = n / 3; + m = n - l * 3; + if (m != 0) + ++l; + if (m == 1) + --ls; + + l2 = l * 2; + l3 = l * 3; + l4 = l * 4; + l5 = l * 5; + A = p; + B = ws; + C = p + l2; + D = ws + l2; + E = p + l4; + W = ws + l4; + } + + /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ + evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); + + /** Second stage: pointwise multiplies. **/ + TOOM3_SQR_REC(D, C, l, W); + tD = cD*cD; + if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD); + ASSERT (tD < 49); + TOOM3_SQR_REC(C, B, l, W); + tC = cC*cC; + /* TO DO: choose one of the following alternatives. */ +#if 0 + if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC); +#else + if (cC >= 1) + { + tC += add2Times (C + l, C + l, B, l); + if (cC == 2) + tC += add2Times (C + l, C + l, B, l); + } +#endif + ASSERT (tC < 9); + TOOM3_SQR_REC(B, A, l, W); + tB = cB*cB; + if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB); + ASSERT (tB < 49); + TOOM3_SQR_REC(A, a, l, W); + TOOM3_SQR_REC(E, a + l2, ls, W); + + /** Third stage: interpolation. **/ + interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); + + /** Final stage: add up the coefficients. **/ + { + mp_limb_t i, x, y; + tB += mpn_add_n (p + l, p + l, B, l2); + tD += mpn_add_n (p + l3, p + l3, D, l2); + mpn_incr_u (p + l3, tB); + mpn_incr_u (p + l4, tC); + mpn_incr_u (p + l5, tD); + } +} + +void +#if __STDC__ +mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) +#else +mpn_mul_n (p, a, b, n) + mp_ptr p; + mp_srcptr a; + mp_srcptr b; + mp_size_t n; +#endif +{ + if (n < KARATSUBA_MUL_THRESHOLD) + mpn_mul_basecase (p, a, n, b, n); + else if (n < TOOM3_MUL_THRESHOLD) + { + /* Allocate workspace of fixed size on stack: fast! */ +#if TUNE_PROGRAM_BUILD + mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB]; +#else + mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB]; +#endif + mpn_kara_mul_n (p, a, b, n, ws); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else if (n < FFT_MUL_THRESHOLD) +#else + else +#endif + { + /* Use workspace of unknown size in heap, as stack space may + * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the + * multiplication will take much longer than malloc()/free(). 
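+     * (Added note: the heap block below holds 2*n + 3*BITS_PER_MP_LIMB +     * limbs of scratch for the toom3 recursion.)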
*/ + mp_limb_t wsLen, *ws; + wsLen = 2 * n + 3 * BITS_PER_MP_LIMB; + ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t)); + mpn_toom3_mul_n (p, a, b, n, ws); + (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t)); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else + { + mpn_mul_fft_full (p, a, n, b, n); + } +#endif +} diff --git a/rts/gmp/mpn/generic/perfsqr.c b/rts/gmp/mpn/generic/perfsqr.c new file mode 100644 index 0000000000..42ee3405d7 --- /dev/null +++ b/rts/gmp/mpn/generic/perfsqr.c @@ -0,0 +1,123 @@ +/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square, + zero otherwise. + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include <stdio.h> /* for NULL */ +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* sq_res_0x100[x mod 0x100] == 1 iff x mod 0x100 is a quadratic residue + modulo 0x100. */ +static unsigned char const sq_res_0x100[0x100] = +{ + 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, +}; + +int +#if __STDC__ +mpn_perfect_square_p (mp_srcptr up, mp_size_t usize) +#else +mpn_perfect_square_p (up, usize) + mp_srcptr up; + mp_size_t usize; +#endif +{ + mp_limb_t rem; + mp_ptr root_ptr; + int res; + TMP_DECL (marker); + + /* The first test excludes 55/64 (85.9%) of the perfect square candidates + in O(1) time. */ + if ((sq_res_0x100[(unsigned int) up[0] % 0x100] & 1) == 0) + return 0; + +#if defined (PP) + /* The second test excludes 30652543/30808063 (99.5%) of the remaining + perfect square candidates in O(n) time. */ + + /* Firstly, compute REM = A mod PP. */ + if (UDIV_TIME > (2 * UMUL_TIME + 6)) + rem = mpn_preinv_mod_1 (up, usize, (mp_limb_t) PP, (mp_limb_t) PP_INVERTED); + else + rem = mpn_mod_1 (up, usize, (mp_limb_t) PP); + + /* Now decide if REM is a quadratic residue modulo the factors in PP. */ + + /* If A is just a few limbs, computing the square root does not take a long + time, so things might run faster if we limit this loop according to the + size of A.
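+     (How the per-prime tests below work, spelled out editorially: for each +     small prime p the constant is a bitmap whose bit r is set exactly when +     r is a quadratic residue mod p; e.g. the squares mod 5 are {0,1,4}, +     giving binary 10011 = 0x13, which matches the "0x13L >> rem % 5" test.)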
*/ + +#if BITS_PER_MP_LIMB == 64 + if (((CNST_LIMB(0x12DD703303AED3) >> rem % 53) & 1) == 0) + return 0; + if (((CNST_LIMB(0x4351B2753DF) >> rem % 47) & 1) == 0) + return 0; + if (((CNST_LIMB(0x35883A3EE53) >> rem % 43) & 1) == 0) + return 0; + if (((CNST_LIMB(0x1B382B50737) >> rem % 41) & 1) == 0) + return 0; + if (((CNST_LIMB(0x165E211E9B) >> rem % 37) & 1) == 0) + return 0; + if (((CNST_LIMB(0x121D47B7) >> rem % 31) & 1) == 0) + return 0; +#endif + if (((0x13D122F3L >> rem % 29) & 1) == 0) + return 0; + if (((0x5335FL >> rem % 23) & 1) == 0) + return 0; + if (((0x30AF3L >> rem % 19) & 1) == 0) + return 0; + if (((0x1A317L >> rem % 17) & 1) == 0) + return 0; + if (((0x161BL >> rem % 13) & 1) == 0) + return 0; + if (((0x23BL >> rem % 11) & 1) == 0) + return 0; + if (((0x017L >> rem % 7) & 1) == 0) + return 0; + if (((0x13L >> rem % 5) & 1) == 0) + return 0; + if (((0x3L >> rem % 3) & 1) == 0) + return 0; +#endif + + TMP_MARK (marker); + + /* For the third and last test, we finally compute the square root, + to make sure we've really got a perfect square. */ + root_ptr = (mp_ptr) TMP_ALLOC ((usize + 1) / 2 * BYTES_PER_MP_LIMB); + + /* Iff mpn_sqrtrem returns zero, the square is perfect. */ + res = ! mpn_sqrtrem (root_ptr, NULL, up, usize); + TMP_FREE (marker); + return res; +} diff --git a/rts/gmp/mpn/generic/popcount.c b/rts/gmp/mpn/generic/popcount.c new file mode 100644 index 0000000000..387be9536d --- /dev/null +++ b/rts/gmp/mpn/generic/popcount.c @@ -0,0 +1,93 @@ +/* popcount.c + +Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#if defined __GNUC__ +/* No processor claiming to be SPARC v9 compliant seems to + implement the POPC instruction. Disable pattern for now. */ +#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64 +#define popc_limb(a) \ + ({ \ + DItype __res; \ + asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \ + __res; \ + }) +#endif +#endif + +#ifndef popc_limb + +/* Cool population count of an mp_limb_t. + You have to figure out how this works; I won't tell you! */ + +static inline unsigned int +#if __STDC__ +popc_limb (mp_limb_t x) +#else +popc_limb (x) + mp_limb_t x; +#endif +{ +#if BITS_PER_MP_LIMB == 64 + /* We have to go to some trouble to define these constants. + (For mp_limb_t being `long long'.)
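+     (Editorial hint, since the comment above won't tell: each step below +     adds adjacent bit-fields in parallel, first 2-bit fields, then 4-bit, +     then wider, until the limb's full bit count collects in the low byte.)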
*/ + mp_limb_t cnst; + cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2); + x -= (x & cnst) >> 1; + cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2); + x = ((x & ~cnst) >> 2) + (x & cnst); + cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2); + x = ((x >> 4) + x) & cnst; + x = ((x >> 8) + x); + x = ((x >> 16) + x); + x = ((x >> 32) + x) & 0xff; +#endif +#if BITS_PER_MP_LIMB == 32 + x -= (x & 0xaaaaaaaa) >> 1; + x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L); + x = ((x >> 4) + x) & 0x0f0f0f0fL; + x = ((x >> 8) + x); + x = ((x >> 16) + x) & 0xff; +#endif + return x; +} +#endif + +unsigned long int +#if __STDC__ +mpn_popcount (register mp_srcptr p, register mp_size_t size) +#else +mpn_popcount (p, size) + register mp_srcptr p; + register mp_size_t size; +#endif +{ + unsigned long int popcnt; + mp_size_t i; + + popcnt = 0; + for (i = 0; i < size; i++) + popcnt += popc_limb (p[i]); + + return popcnt; +} diff --git a/rts/gmp/mpn/generic/pre_mod_1.c b/rts/gmp/mpn/generic/pre_mod_1.c new file mode 100644 index 0000000000..27179683b3 --- /dev/null +++ b/rts/gmp/mpn/generic/pre_mod_1.c @@ -0,0 +1,69 @@ +/* mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb, + divisor_limb_inverted) -- + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by the normalized DIVISOR_LIMB. + DIVISOR_LIMB_INVERTED should be 2^(2*BITS_PER_MP_LIMB) / DIVISOR_LIMB + + - 2^BITS_PER_MP_LIMB. + Return the single-limb remainder. + +Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef UMUL_TIME +#define UMUL_TIME 1 +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME UMUL_TIME +#endif + +mp_limb_t +#if __STDC__ +mpn_preinv_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size, + mp_limb_t divisor_limb, mp_limb_t divisor_limb_inverted) +#else +mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb, divisor_limb_inverted) + mp_srcptr dividend_ptr; + mp_size_t dividend_size; + mp_limb_t divisor_limb; + mp_limb_t divisor_limb_inverted; +#endif +{ + mp_size_t i; + mp_limb_t n0, r; + int dummy; + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + i--; + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (dummy, r, r, n0, divisor_limb, divisor_limb_inverted); + } + return r; +} diff --git a/rts/gmp/mpn/generic/random.c b/rts/gmp/mpn/generic/random.c new file mode 100644 index 0000000000..dea4e20e56 --- /dev/null +++ b/rts/gmp/mpn/generic/random.c @@ -0,0 +1,43 @@ +/* mpn_random -- Generate random numbers. + +Copyright (C) 1996, 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "urandom.h" + +void +#if __STDC__ +mpn_random (mp_ptr res_ptr, mp_size_t size) +#else +mpn_random (res_ptr, size) + mp_ptr res_ptr; + mp_size_t size; +#endif +{ + mp_size_t i; + + for (i = 0; i < size; i++) + res_ptr[i] = urandom (); + + /* Make sure the most significant limb is non-zero. */ + while (res_ptr[size - 1] == 0) + res_ptr[size - 1] = urandom (); +} diff --git a/rts/gmp/mpn/generic/random2.c b/rts/gmp/mpn/generic/random2.c new file mode 100644 index 0000000000..86682f81fa --- /dev/null +++ b/rts/gmp/mpn/generic/random2.c @@ -0,0 +1,105 @@ +/* mpn_random2 -- Generate random numbers with relatively long strings + of ones and zeroes. Suitable for border testing. + +Copyright (C) 1992, 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#if defined (__hpux) || defined (__alpha) || defined (__svr4__) || defined (__SVR4) +/* HPUX lacks random(). DEC OSF/1 1.2 random() returns a double. */ +long mrand48 (); +static inline long +random () +{ + return mrand48 (); +} +#elif defined(_WIN32) && !(defined(__CYGWIN__) || defined(__CYGWIN32__)) +/* MS CRT supplies just the poxy rand(), with an upper bound of 0x7fff, + so paste three 15-bit pieces together to fill an unsigned long. */ +static inline unsigned long +random () +{ + return rand () ^ ((unsigned long) rand () << 15) ^ ((unsigned long) rand () << 30); +} + +#else +long random (); +#endif + +/* It's a bit tricky to get this right, so please test the code well + if you hack with it. Some early versions of the function produced + random numbers with the leading limb == 0, and some versions never + made the most significant bit set. */ + +void +#if __STDC__ +mpn_random2 (mp_ptr res_ptr, mp_size_t size) +#else +mpn_random2 (res_ptr, size) + mp_ptr res_ptr; + mp_size_t size; +#endif +{ + int n_bits; + int bit_pos; + mp_size_t limb_pos; + unsigned int ran; + mp_limb_t limb; + + limb = 0; + + /* Start off in a random bit position in the most significant limb.
*/ + bit_pos = random () & (BITS_PER_MP_LIMB - 1); + + /* Least significant bit of RAN chooses string of ones/string of zeroes. + Make most significant limb be non-zero by setting bit 0 of RAN. */ + ran = random () | 1; + + for (limb_pos = size - 1; limb_pos >= 0; ) + { + n_bits = (ran >> 1) % BITS_PER_MP_LIMB + 1; + if ((ran & 1) != 0) + { + /* Generate a string of ones. */ + if (n_bits >= bit_pos) + { + res_ptr[limb_pos--] = limb | ((((mp_limb_t) 2) << bit_pos) - 1); + bit_pos += BITS_PER_MP_LIMB; + limb = (~(mp_limb_t) 0) << (bit_pos - n_bits); + } + else + { + limb |= ((((mp_limb_t) 1) << n_bits) - 1) << (bit_pos - n_bits + 1); + } + } + else + { + /* Generate a string of zeroes. */ + if (n_bits >= bit_pos) + { + res_ptr[limb_pos--] = limb; + limb = 0; + bit_pos += BITS_PER_MP_LIMB; + } + } + bit_pos -= n_bits; + ran = random (); + } +} diff --git a/rts/gmp/mpn/generic/rshift.c b/rts/gmp/mpn/generic/rshift.c new file mode 100644 index 0000000000..59caf73529 --- /dev/null +++ b/rts/gmp/mpn/generic/rshift.c @@ -0,0 +1,88 @@ +/* mpn_rshift -- Shift right a low-level natural-number integer. + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right + and store the USIZE least significant limbs of the result at WP. + The bits shifted out to the right are returned. + + Argument constraints: + 1. 0 < CNT < BITS_PER_MP_LIMB + 2. If the result is to be written over the input, WP must be <= UP. +*/ + +mp_limb_t +#if __STDC__ +mpn_rshift (register mp_ptr wp, + register mp_srcptr up, mp_size_t usize, + register unsigned int cnt) +#else +mpn_rshift (wp, up, usize, cnt) + register mp_ptr wp; + register mp_srcptr up; + mp_size_t usize; + register unsigned int cnt; +#endif +{ + register mp_limb_t high_limb, low_limb; + register unsigned sh_1, sh_2; + register mp_size_t i; + mp_limb_t retval; + +#ifdef DEBUG + if (usize == 0 || cnt == 0) + abort (); +#endif + + sh_1 = cnt; + +#if 0 + if (sh_1 == 0) + { + if (wp != up) + { + /* Copy from low end to high end, to allow specified input/output + overlapping. 
*/ + for (i = 0; i < usize; i++) + wp[i] = up[i]; + } + return usize; + } +#endif + + wp -= 1; + sh_2 = BITS_PER_MP_LIMB - sh_1; + high_limb = up[0]; + retval = high_limb << sh_2; + low_limb = high_limb; + + for (i = 1; i < usize; i++) + { + high_limb = up[i]; + wp[i] = (low_limb >> sh_1) | (high_limb << sh_2); + low_limb = high_limb; + } + wp[i] = low_limb >> sh_1; + + return retval; +} diff --git a/rts/gmp/mpn/generic/sb_divrem_mn.c b/rts/gmp/mpn/generic/sb_divrem_mn.c new file mode 100644 index 0000000000..a269e34f5f --- /dev/null +++ b/rts/gmp/mpn/generic/sb_divrem_mn.c @@ -0,0 +1,201 @@ +/* mpn_sb_divrem_mn -- Divide natural numbers, producing both remainder and + quotient. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A + FUTURE GNU MP RELEASE. + + +Copyright (C) 1993, 1994, 1995, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write + the NSIZE-DSIZE least significant quotient limbs at QP + and the DSIZE long remainder at NP. If QEXTRA_LIMBS is + non-zero, generate that many fraction bits and append them after the + other quotient limbs. + Return the most significant limb of the quotient; this is always 0 or 1. + + Preconditions: + 0. NSIZE >= DSIZE. + 1. The most significant bit of the divisor must be set. + 2. QP must either not overlap with the input operands at all, or + QP + DSIZE >= NP must hold true. (This means that it's + possible to put the quotient in the high part of NUM, right after the + remainder in NUM.) + 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero. + 4. DSIZE > 2. */ + + +#define PREINVERT_VIABLE \ + (UDIV_TIME > 2 * UMUL_TIME + 6 /* && ! TARGET_REGISTER_STARVED */) + +mp_limb_t +#if __STDC__ +mpn_sb_divrem_mn (mp_ptr qp, + mp_ptr np, mp_size_t nsize, + mp_srcptr dp, mp_size_t dsize) +#else +mpn_sb_divrem_mn (qp, np, nsize, dp, dsize) + mp_ptr qp; + mp_ptr np; + mp_size_t nsize; + mp_srcptr dp; + mp_size_t dsize; +#endif +{ + mp_limb_t most_significant_q_limb = 0; + mp_size_t i; + mp_limb_t dx, d1, n0; + mp_limb_t dxinv; + int have_preinv; + + ASSERT_ALWAYS (dsize > 2); + + np += nsize - dsize; + dx = dp[dsize - 1]; + d1 = dp[dsize - 2]; + n0 = np[dsize - 1]; + + if (n0 >= dx) + { + if (n0 > dx || mpn_cmp (np, dp, dsize - 1) >= 0) + { + mpn_sub_n (np, np, dp, dsize); + most_significant_q_limb = 1; + } + } + + /* If multiplication is much faster than division, preinvert the + most significant divisor limb before entering the loop.
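+     (Added note: dxinv is the quantity described in pre_mod_1.c, roughly +     2^(2*BITS_PER_MP_LIMB)/dx - 2^BITS_PER_MP_LIMB, which lets +     udiv_qrnnd_preinv replace each hardware division by multiplications.)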
*/ + if (PREINVERT_VIABLE) + { + have_preinv = 0; + if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - dsize) > UDIV_TIME) + { + invert_limb (dxinv, dx); + have_preinv = 1; + } + } + + for (i = nsize - dsize - 1; i >= 0; i--) + { + mp_limb_t q; + mp_limb_t nx; + mp_limb_t cy_limb; + + nx = np[dsize - 1]; + np--; + + if (nx == dx) + { + /* This might over-estimate q, but it's probably not worth + the extra code here to find out. */ + q = ~(mp_limb_t) 0; + +#if 1 + cy_limb = mpn_submul_1 (np, dp, dsize, q); +#else + /* This should be faster on many machines */ + cy_limb = mpn_sub_n (np + 1, np + 1, dp, dsize); + cy = mpn_add_n (np, np, dp, dsize); + np[dsize] += cy; +#endif + + if (nx != cy_limb) + { + mpn_add_n (np, np, dp, dsize); + q--; + } + + qp[i] = q; + } + else + { + mp_limb_t rx, r1, r0, p1, p0; + + /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register + usage when np[dsize-1] is used in an asm statement like + umul_ppmm in udiv_qrnnd_preinv. The symptom is seg faults due + to registers being clobbered. gcc 2.95 i386 doesn't have the + problem. */ + { + mp_limb_t workaround = np[dsize - 1]; + if (PREINVERT_VIABLE && have_preinv) + udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv); + else + udiv_qrnnd (q, r1, nx, workaround, dx); + } + umul_ppmm (p1, p0, d1, q); + + r0 = np[dsize - 2]; + rx = 0; + if (r1 < p1 || (r1 == p1 && r0 < p0)) + { + p1 -= p0 < d1; + p0 -= d1; + q--; + r1 += dx; + rx = r1 < dx; + } + + p1 += r0 < p0; /* cannot carry! */ + rx -= r1 < p1; /* may become 11..1 if q is still too large */ + r1 -= p1; + r0 -= p0; + + cy_limb = mpn_submul_1 (np, dp, dsize - 2, q); + + { + mp_limb_t cy1, cy2; + cy1 = r0 < cy_limb; + r0 -= cy_limb; + cy2 = r1 < cy1; + r1 -= cy1; + np[dsize - 1] = r1; + np[dsize - 2] = r0; + if (cy2 != rx) + { + mpn_add_n (np, np, dp, dsize); + q--; + } + } + qp[i] = q; + } + } + + /* ______ ______ ______ + |__rx__|__r1__|__r0__| partial remainder + ______ ______ + - |__p1__|__p0__| partial product to subtract + ______ ______ + - |______|cylimb| + + rx is -1, 0 or 1. If rx=1, then q is correct (it should match + carry out). If rx=-1 then q is too large. If rx=0, then q might + be too large, but it is most likely correct. + */ + + return most_significant_q_limb; +} diff --git a/rts/gmp/mpn/generic/scan0.c b/rts/gmp/mpn/generic/scan0.c new file mode 100644 index 0000000000..96f05ce854 --- /dev/null +++ b/rts/gmp/mpn/generic/scan0.c @@ -0,0 +1,62 @@ +/* mpn_scan0 -- Scan from a given bit position for the next clear bit. + +Copyright (C) 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Design issues: + 1. What if starting_bit is not within U? Caller's problem? + 2. 
Bit index should be 'unsigned'? + + Argument constraints: + 1. U must sooner or later have a limb with a clear bit. + */ + +unsigned long int +#if __STDC__ +mpn_scan0 (register mp_srcptr up, + register unsigned long int starting_bit) +#else +mpn_scan0 (up, starting_bit) + register mp_srcptr up; + register unsigned long int starting_bit; +#endif +{ + mp_size_t starting_word; + mp_limb_t alimb; + int cnt; + mp_srcptr p; + + /* Start at the word implied by STARTING_BIT. */ + starting_word = starting_bit / BITS_PER_MP_LIMB; + p = up + starting_word; + alimb = ~*p++; + + /* Mask off any bits before STARTING_BIT in the first limb. */ + alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB); + + while (alimb == 0) + alimb = ~*p++; + + count_leading_zeros (cnt, alimb & -alimb); + return (p - up) * BITS_PER_MP_LIMB - 1 - cnt; +} diff --git a/rts/gmp/mpn/generic/scan1.c b/rts/gmp/mpn/generic/scan1.c new file mode 100644 index 0000000000..98e2e0dcc0 --- /dev/null +++ b/rts/gmp/mpn/generic/scan1.c @@ -0,0 +1,62 @@ +/* mpn_scan1 -- Scan from a given bit position for the next set bit. + +Copyright (C) 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Design issues: + 1. What if starting_bit is not within U? Caller's problem? + 2. Bit index should be 'unsigned'? + + Argument constraints: + 1. U must sooner or later have a limb != 0. + */ + +unsigned long int +#if __STDC__ +mpn_scan1 (register mp_srcptr up, + register unsigned long int starting_bit) +#else +mpn_scan1 (up, starting_bit) + register mp_srcptr up; + register unsigned long int starting_bit; +#endif +{ + mp_size_t starting_word; + mp_limb_t alimb; + int cnt; + mp_srcptr p; + + /* Start at the word implied by STARTING_BIT. */ + starting_word = starting_bit / BITS_PER_MP_LIMB; + p = up + starting_word; + alimb = *p++; + + /* Mask off any bits before STARTING_BIT in the first limb. */ + alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB); + + while (alimb == 0) + alimb = *p++; + + count_leading_zeros (cnt, alimb & -alimb); + return (p - up) * BITS_PER_MP_LIMB - 1 - cnt; +} diff --git a/rts/gmp/mpn/generic/set_str.c b/rts/gmp/mpn/generic/set_str.c new file mode 100644 index 0000000000..e6ccc92154 --- /dev/null +++ b/rts/gmp/mpn/generic/set_str.c @@ -0,0 +1,159 @@ +/* mpn_set_str (mp_ptr res_ptr, const char *str, size_t str_len, int base) + -- Convert a STR_LEN long base BASE byte string pointed to by STR to a + limb vector pointed to by RES_PTR. Return the number of limbs in + RES_PTR. + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library.
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_size_t +#if __STDC__ +mpn_set_str (mp_ptr xp, const unsigned char *str, size_t str_len, int base) +#else +mpn_set_str (xp, str, str_len, base) + mp_ptr xp; + const unsigned char *str; + size_t str_len; + int base; +#endif +{ + mp_size_t size; + mp_limb_t big_base; + int indigits_per_limb; + mp_limb_t res_digit; + + big_base = __mp_bases[base].big_base; + indigits_per_limb = __mp_bases[base].chars_per_limb; + +/* size = str_len / indigits_per_limb + 1; */ + + size = 0; + + if ((base & (base - 1)) == 0) + { + /* The base is a power of 2. Read the input string from + least to most significant character/digit. */ + + const unsigned char *s; + int next_bitpos; + int bits_per_indigit = big_base; + + res_digit = 0; + next_bitpos = 0; + + for (s = str + str_len - 1; s >= str; s--) + { + int inp_digit = *s; + + res_digit |= (mp_limb_t) inp_digit << next_bitpos; + next_bitpos += bits_per_indigit; + if (next_bitpos >= BITS_PER_MP_LIMB) + { + xp[size++] = res_digit; + next_bitpos -= BITS_PER_MP_LIMB; + res_digit = inp_digit >> (bits_per_indigit - next_bitpos); + } + } + + if (res_digit != 0) + xp[size++] = res_digit; + } + else + { + /* General case. The base is not a power of 2. */ + + size_t i; + int j; + mp_limb_t cy_limb; + + for (i = indigits_per_limb; i < str_len; i += indigits_per_limb) + { + res_digit = *str++; + if (base == 10) + { /* This is a common case. + Help the compiler to avoid multiplication. */ + for (j = 1; j < indigits_per_limb; j++) + res_digit = res_digit * 10 + *str++; + } + else + { + for (j = 1; j < indigits_per_limb; j++) + res_digit = res_digit * base + *str++; + } + + if (size == 0) + { + if (res_digit != 0) + { + xp[0] = res_digit; + size = 1; + } + } + else + { + cy_limb = mpn_mul_1 (xp, xp, size, big_base); + cy_limb += mpn_add_1 (xp, xp, size, res_digit); + if (cy_limb != 0) + xp[size++] = cy_limb; + } + } + + big_base = base; + res_digit = *str++; + if (base == 10) + { /* This is a common case. + Help the compiler to avoid multiplication. 
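+          (Added note: this second pass folds in the final group of fewer +          than indigits_per_limb digits, growing big_base to base^t, t being +          the number of tail digits, so the group enters via one mpn_mul_1 +          below.)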
*/ + for (j = 1; j < str_len - (i - indigits_per_limb); j++) + { + res_digit = res_digit * 10 + *str++; + big_base *= 10; + } + } + else + { + for (j = 1; j < str_len - (i - indigits_per_limb); j++) + { + res_digit = res_digit * base + *str++; + big_base *= base; + } + } + + if (size == 0) + { + if (res_digit != 0) + { + xp[0] = res_digit; + size = 1; + } + } + else + { + cy_limb = mpn_mul_1 (xp, xp, size, big_base); + cy_limb += mpn_add_1 (xp, xp, size, res_digit); + if (cy_limb != 0) + xp[size++] = cy_limb; + } + } + + return size; +} diff --git a/rts/gmp/mpn/generic/sqr_basecase.c b/rts/gmp/mpn/generic/sqr_basecase.c new file mode 100644 index 0000000000..760258a3e0 --- /dev/null +++ b/rts/gmp/mpn/generic/sqr_basecase.c @@ -0,0 +1,83 @@ +/* mpn_sqr_basecase -- Internal routine to square two natural numbers + of length m and n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +void +#if __STDC__ +mpn_sqr_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t n) +#else +mpn_sqr_basecase (prodp, up, n) + mp_ptr prodp; + mp_srcptr up; + mp_size_t n; +#endif +{ + mp_size_t i; + + { + /* N.B.! We need the superfluous indirection through argh to work around + a reloader bug in GCC 2.7.*. */ + mp_limb_t x; + mp_limb_t argh; + x = up[0]; + umul_ppmm (argh, prodp[0], x, x); + prodp[1] = argh; + } + if (n > 1) + { + mp_limb_t tarr[2 * KARATSUBA_SQR_THRESHOLD]; + mp_ptr tp = tarr; + mp_limb_t cy; + + /* must fit 2*n limbs in tarr */ + ASSERT (n <= KARATSUBA_SQR_THRESHOLD); + + cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]); + tp[n - 1] = cy; + for (i = 2; i < n; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]); + tp[n + i - 2] = cy; + } + for (i = 1; i < n; i++) + { + mp_limb_t x; + x = up[i]; + umul_ppmm (prodp[2 * i + 1], prodp[2 * i], x, x); + } + { + mp_limb_t cy; + cy = mpn_lshift (tp, tp, 2 * n - 2, 1); + cy += mpn_add_n (prodp + 1, prodp + 1, tp, 2 * n - 2); + prodp[2 * n - 1] += cy; + } + } +} diff --git a/rts/gmp/mpn/generic/sqrtrem.c b/rts/gmp/mpn/generic/sqrtrem.c new file mode 100644 index 0000000000..ee3b5144dd --- /dev/null +++ b/rts/gmp/mpn/generic/sqrtrem.c @@ -0,0 +1,509 @@ +/* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) + + Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR. + Write the remainder at REM_PTR, if REM_PTR != NULL. + Return the size of the remainder. + (The size of the root is always half of the size of the operand.) + + OP_PTR and ROOT_PTR may not point to the same object. 
+ OP_PTR and REM_PTR may point to the same object. + + If REM_PTR is NULL, only the root is computed and the return value of + the function is 0 if OP is a perfect square, and *any* non-zero number + otherwise. + +Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* This code is just correct if "unsigned char" has at least 8 bits. It + doesn't help to use CHAR_BIT from limits.h, as the real problem is + the static arrays. */ + +#include <stdio.h> /* for NULL */ +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Square root algorithm: + + 1. Shift OP (the input) to the left an even number of bits s.t. there + are an even number of words and either (or both) of the most + significant bits are set. This way, sqrt(OP) has exactly half as + many words as OP, and has its most significant bit set. + + 2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables. + This approximation is used for the first single-precision + iterations of Newton's method, yielding a full-word approximation + to sqrt(OP). + + 3. Perform multiple-precision Newton iteration until we have the + exact result. Only about half of the input operand is used in + this calculation, as the square root is perfectly determinable + from just the higher half of a number. */ + +/* Define this macro for IEEE P854 machines with a fast sqrt instruction. */ +#if defined __GNUC__ && ! defined __SOFT_FLOAT + +#if defined (__sparc__) && BITS_PER_MP_LIMB == 32 +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a)); \ + __sqrt_res; \ + }) +#endif + +#if defined (__HAVE_68881__) +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a)); \ + __sqrt_res; \ + }) +#endif + +#if defined (__hppa) && BITS_PER_MP_LIMB == 32 +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a)); \ + __sqrt_res; \ + }) +#endif + +#if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32 +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a)); \ + __sqrt_res; \ + }) +#endif + +#if 0 +#if defined (__i386__) || defined (__i486__) +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a)); \ + __sqrt_res; \ + }) +#endif +#endif + +#endif + +#ifndef SQRT + +/* Tables for initial approximation of the square root. These are + indexed with bits 1-8 of the operand for which the square root is + calculated, where bit 0 is the most significant non-zero bit. I.e. + the most significant one-bit is not used, since that per definition + is one. Likewise, the tables don't return the highest bit of the + result. 
That bit must be inserted by or:ing the returned value with + 0x100. This way, we get a 9-bit approximation from 8-bit tables! */ + +/* Table to be used for operands with an even total number of bits. + (Exactly as in the decimal system there are similarities between the + square root of numbers with the same initial digits and an even + difference in the total number of digits. Consider the square root + of 1, 10, 100, 1000, ...) */ +static const unsigned char even_approx_tab[256] = +{ + 0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e, + 0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74, + 0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79, + 0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f, + 0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84, + 0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89, + 0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f, + 0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94, + 0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99, + 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e, + 0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3, + 0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7, + 0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac, + 0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1, + 0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6, + 0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba, + 0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf, + 0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3, + 0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8, + 0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc, + 0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1, + 0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5, + 0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda, + 0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde, + 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2, + 0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6, + 0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb, + 0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef, + 0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3, + 0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7, + 0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb, + 0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff, +}; + +/* Table to be used for operands with an odd total number of bits. + (Further comments before previous table.) 
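+
+   For example, with 32-bit limbs, a normalized t_high0 = 0x40000000
+   (second-highest bit set, i.e. the odd case) indexes entry 0 of this
+   table, giving 0x00; with the implicit high bit or:ed in that is
+   0x100, and shifted to the top of the limb the start approximation
+   becomes 0x80000000, which is exactly sqrt(0x40000000 * 2**32).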
*/ +static const unsigned char odd_approx_tab[256] = +{ + 0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, + 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, + 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, + 0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, + 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16, + 0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a, + 0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d, + 0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21, + 0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24, + 0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28, + 0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b, + 0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f, + 0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32, + 0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35, + 0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39, + 0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c, + 0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f, + 0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42, + 0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45, + 0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49, + 0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c, + 0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f, + 0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52, + 0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55, + 0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58, + 0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b, + 0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e, + 0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61, + 0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63, + 0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66, + 0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69, +}; +#endif + + +mp_size_t +#if __STDC__ +mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size) +#else +mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) + mp_ptr root_ptr; + mp_ptr rem_ptr; + mp_srcptr op_ptr; + mp_size_t op_size; +#endif +{ + /* R (root result) */ + mp_ptr rp; /* Pointer to least significant word */ + mp_size_t rsize; /* The size in words */ + + /* T (OP shifted to the left a.k.a. normalized) */ + mp_ptr tp; /* Pointer to least significant word */ + mp_size_t tsize; /* The size in words */ + mp_ptr t_end_ptr; /* Pointer right beyond most sign. word */ + mp_limb_t t_high0, t_high1; /* The two most significant words */ + + /* TT (temporary for numerator/remainder) */ + mp_ptr ttp; /* Pointer to least significant word */ + + /* X (temporary for quotient in main loop) */ + mp_ptr xp; /* Pointer to least significant word */ + mp_size_t xsize; /* The size in words */ + + unsigned cnt; + mp_limb_t initial_approx; /* Initially made approximation */ + mp_size_t tsizes[BITS_PER_MP_LIMB]; /* Successive calculation precisions */ + mp_size_t tmp; + mp_size_t i; + + mp_limb_t cy_limb; + TMP_DECL (marker); + + /* If OP is zero, both results are zero. */ + if (op_size == 0) + return 0; + + count_leading_zeros (cnt, op_ptr[op_size - 1]); + tsize = op_size; + if ((tsize & 1) != 0) + { + cnt += BITS_PER_MP_LIMB; + tsize++; + } + + rsize = tsize / 2; + rp = root_ptr; + + TMP_MARK (marker); + + /* Shift OP an even number of bits into T, such that either the most or + the second most significant bit is set, and such that the number of + words in T becomes even. This way, the number of words in R=sqrt(OP) + is exactly half as many as in OP, and the most significant bit of R + is set. + + Also, the initial approximation is simplified by this up-shifted OP. 
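+
+     For example (32-bit limbs), OP = {5} gives cnt = 29 + 32 = 61 and
+     TSIZE = 2, so T becomes {0, 5 << 28}, i.e. 5 * 2**60.  Its root is
+     floor(sqrt(5) * 2**30), and the final down-shift by cnt/2 = 30
+     recovers floor(sqrt(5)) = 2.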
+ + Finally, the Newtonian iteration which is the main part of this + program performs division by R. The fast division routine expects + the divisor to be "normalized" in exactly the sense of having the + most significant bit set. */ + + tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB); + + if ((cnt & ~1) % BITS_PER_MP_LIMB != 0) + t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size, + (cnt & ~1) % BITS_PER_MP_LIMB); + else + MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size); + + if (cnt >= BITS_PER_MP_LIMB) + tp[0] = 0; + + t_high0 = tp[tsize - 1]; + t_high1 = tp[tsize - 2]; /* Never stray. TSIZE is >= 2. */ + +/* Is there a fast sqrt instruction defined for this machine? */ +#ifdef SQRT + { + initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1); + /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have + become incorrect due to overflow in the conversion from double to + mp_limb_t above. It will typically be zero in that case, but might be + a small number on some machines. The most significant bit of + INITIAL_APPROX should be set, so that bit is a good overflow + indication. */ + if ((mp_limb_signed_t) initial_approx >= 0) + initial_approx = ~(mp_limb_t)0; + } +#else + /* Get a 9 bit approximation from the tables. The tables expect to + be indexed with the 8 high bits right below the highest bit. + Also, the highest result bit is not returned by the tables, and + must be or:ed into the result. The scheme gives 9 bits of start + approximation with just 256-entry 8 bit tables. */ + + if ((cnt & 1) == 0) + { + /* The most significant bit of t_high0 is set. */ + initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1); + initial_approx &= 0xff; + initial_approx = even_approx_tab[initial_approx]; + } + else + { + /* The most significant bit of t_high0 is unset, + the second most significant is set. */ + initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2); + initial_approx &= 0xff; + initial_approx = odd_approx_tab[initial_approx]; + } + initial_approx |= 0x100; + initial_approx <<= BITS_PER_MP_LIMB - 8 - 1; + + /* Perform small precision Newtonian iterations to get a full word + approximation. For small operands, these iterations will do the + entire job. */ + if (t_high0 == ~(mp_limb_t)0) + initial_approx = t_high0; + else + { + mp_limb_t quot; + + if (t_high0 >= initial_approx) + initial_approx = t_high0 + 1; + + /* First get about 18 bits with pure C arithmetics. */ + quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2; + initial_approx = (initial_approx + quot) / 2; + initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1); + + /* Now get a full word by one (or for > 36 bit machines) several + iterations. */ + for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1) + { + mp_limb_t ignored_remainder; + + udiv_qrnnd (quot, ignored_remainder, + t_high0, t_high1, initial_approx); + initial_approx = (initial_approx + quot) / 2; + initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1); + } + } +#endif + + rp[0] = initial_approx; + rsize = 1; + +#ifdef SQRT_DEBUG + printf ("\n\nT = "); + mpn_dump (tp, tsize); +#endif + + if (tsize > 2) + { + /* Determine the successive precisions to use in the iteration. We + minimize the precisions, beginning with the highest (i.e. last + iteration) to the lowest (i.e. first iteration). 
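+
+	 For instance, with tsize = 16 the loop below stores
+	 tsizes[] = {12, 6, 3}; the main iteration then divides with
+	 3-, 6- and finally 12-limb numerators, doubling R from 1 to
+	 2, 4 and finally the full 8 = tsize/2 limbs, so each Newton
+	 step roughly doubles the number of correct limbs.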
*/
+
+      xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+      ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+
+      t_end_ptr = tp + tsize;
+
+      tmp = tsize / 2;
+      for (i = 0;; i++)
+	{
+	  tsize = (tmp + 1) / 2;
+	  if (tmp == tsize)
+	    break;
+	  tsizes[i] = tsize + tmp;
+	  tmp = tsize;
+	}
+
+      /* Main Newton iteration loop.  For big arguments, most of the
+	 time is spent here.  */
+
+      /* It is possible to do a great optimization here.  The successive
+	 divisors in the mpn_divmod call below have more and more leading
+	 words equal to its predecessor.  Therefore the beginning of
+	 each division will repeat the same work as did the last
+	 division.  If we could guarantee that the leading words of two
+	 consecutive divisors are the same (i.e. in this case, a later
+	 divisor has just more digits at the end) it would be a simple
+	 matter of just using the old remainder of the last division in
+	 a subsequent division, to take care of this optimization.  This
+	 idea would surely make a difference even for small arguments.  */
+
+      /* Loop invariants:
+
+	 R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
+	 X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
+	 R <= shiftdown_to_same_size(X).  */
+
+      while (--i >= 0)
+	{
+	  mp_limb_t cy;
+#ifdef SQRT_DEBUG
+	  mp_limb_t old_least_sign_r = rp[0];
+	  mp_size_t old_rsize = rsize;
+
+	  printf ("R = ");
+	  mpn_dump (rp, rsize);
+#endif
+	  tsize = tsizes[i];
+
+	  /* Need to copy the numerator into temporary space, as
+	     mpn_divmod overwrites its numerator argument with the
+	     remainder (which we currently ignore).  */
+	  MPN_COPY (ttp, t_end_ptr - tsize, tsize);
+	  cy = mpn_divmod (xp, ttp, tsize, rp, rsize);
+	  xsize = tsize - rsize;
+
+#ifdef SQRT_DEBUG
+	  printf ("X =%d ", cy);
+	  mpn_dump (xp, xsize);
+#endif
+
+	  /* Add X and R with the most significant limbs aligned,
+	     temporarily ignoring at least one limb at the low end of X.  */
+	  tmp = xsize - rsize;
+	  cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize);
+
+	  /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
+	     intermediate roots that'd need an extra bit.  We don't want to
+	     handle that since it would make the subsequent divisor
+	     non-normalized, so round such roots down to be only ones in the
+	     current precision.  */
+	  if (cy == 2)
+	    {
+	      mp_size_t j;
+	      for (j = xsize; j >= 0; j--)
+		xp[j] = ~(mp_limb_t)0;
+	    }
+
+	  /* Divide X by 2 and put the result in R.  This is the new
+	     approximation.  Shift in the carry from the addition.  */
+	  mpn_rshift (rp, xp, xsize, 1);
+	  rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
+	  rsize = xsize;
+#ifdef SQRT_DEBUG
+	  if (old_least_sign_r != rp[rsize - old_rsize])
+	    printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n",
+		    i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r,
+		    2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]);
+#endif
+	}
+    }
+
+#ifdef SQRT_DEBUG
+  printf ("(final) R = ");
+  mpn_dump (rp, rsize);
+#endif
+
+  /* We computed the square root of OP * 2**(2*floor(cnt/2)).
+     This has resulted in R being 2**floor(cnt/2) too large.
+     Shift it down here to fix that.  */
+  if (cnt / 2 != 0)
+    {
+      mpn_rshift (rp, rp, rsize, cnt/2);
+      rsize -= rp[rsize - 1] == 0;
+    }
+
+  /* Calculate the remainder.  */
+  mpn_mul_n (tp, rp, rp, rsize);
+  tsize = rsize + rsize;
+  tsize -= tp[tsize - 1] == 0;
+  if (op_size < tsize
+      || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
+    {
+      /* R is too large.  Decrement it.  */
+
+      /* These operations can't overflow.
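+	 They compute (R-1)**2 = R**2 - 2*R + 1 from the R**2 already
+	 in {tp, tsize}; R exceeds floor(sqrt(OP)) by exactly one at
+	 this point, so (R-1)**2 <= OP and nothing can go negative.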
*/ + cy_limb = mpn_sub_n (tp, tp, rp, rsize); + cy_limb += mpn_sub_n (tp, tp, rp, rsize); + mpn_decr_u (tp + rsize, cy_limb); + mpn_incr_u (tp, (mp_limb_t) 1); + + mpn_decr_u (rp, (mp_limb_t) 1); + +#ifdef SQRT_DEBUG + printf ("(adjusted) R = "); + mpn_dump (rp, rsize); +#endif + } + + if (rem_ptr != NULL) + { + cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize); + MPN_NORMALIZE (rem_ptr, op_size); + TMP_FREE (marker); + return op_size; + } + else + { + int res; + res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size); + TMP_FREE (marker); + return res; + } +} diff --git a/rts/gmp/mpn/generic/sub_n.c b/rts/gmp/mpn/generic/sub_n.c new file mode 100644 index 0000000000..4f2f06099c --- /dev/null +++ b/rts/gmp/mpn/generic/sub_n.c @@ -0,0 +1,62 @@ +/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +#if __STDC__ +mpn_sub_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size) +#else +mpn_sub_n (res_ptr, s1_ptr, s2_ptr, size) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + register mp_srcptr s2_ptr; + mp_size_t size; +#endif +{ + register mp_limb_t x, y, cy; + register mp_size_t j; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -size; + + /* Offset the base pointers to compensate for the negative indices. */ + s1_ptr -= j; + s2_ptr -= j; + res_ptr -= j; + + cy = 0; + do + { + y = s2_ptr[j]; + x = s1_ptr[j]; + y += cy; /* add previous carry to subtrahend */ + cy = (y < cy); /* get out carry from that addition */ + y = x - y; /* main subtract */ + cy = (y > x) + cy; /* get out carry from the subtract, combine */ + res_ptr[j] = y; + } + while (++j != 0); + + return cy; +} diff --git a/rts/gmp/mpn/generic/submul_1.c b/rts/gmp/mpn/generic/submul_1.c new file mode 100644 index 0000000000..c7c08ee4af --- /dev/null +++ b/rts/gmp/mpn/generic/submul_1.c @@ -0,0 +1,65 @@ +/* mpn_submul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR + by S2_LIMB, subtract the S1_SIZE least significant limbs of the product + from the limb vector pointed to by RES_PTR. Return the most significant + limb of the product, adjusted for carry-out from the subtraction. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_submul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+     register mp_ptr res_ptr;
+     register mp_srcptr s1_ptr;
+     mp_size_t s1_size;
+     register mp_limb_t s2_limb;
+{
+  register mp_limb_t cy_limb;
+  register mp_size_t j;
+  register mp_limb_t prod_high, prod_low;
+  register mp_limb_t x;
+
+  /* The loop counter and index J goes from -SIZE to -1.  This way
+     the loop becomes faster.  */
+  j = -s1_size;
+
+  /* Offset the base pointers to compensate for the negative indices.  */
+  res_ptr -= j;
+  s1_ptr -= j;
+
+  cy_limb = 0;
+  do
+    {
+      umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
+      prod_low += cy_limb;
+      cy_limb = (prod_low < cy_limb) + prod_high;
+
+      x = res_ptr[j];
+      prod_low = x - prod_low;
+      cy_limb += (prod_low > x);
+      res_ptr[j] = prod_low;
+    }
+  while (++j != 0);
+
+  return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/tdiv_qr.c b/rts/gmp/mpn/generic/tdiv_qr.c
new file mode 100644
index 0000000000..b748b5d810
--- /dev/null
+++ b/rts/gmp/mpn/generic/tdiv_qr.c
@@ -0,0 +1,401 @@
+/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and
+   write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp.  If
+   qxn is non-zero, generate that many fraction limbs and append them after the
+   other quotient limbs, and update the remainder accordingly.  The input
+   operands are unaffected.
+
+   Preconditions:
+   1. The most significant limb of the divisor must be non-zero.
+   2. No argument overlap is permitted.  (??? relax this ???)
+   3. nn >= dn, even if qxn is non-zero.  (??? relax this ???)
+
+   The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time
+   complexity of multiplication.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD) +#endif + +/* Extract the middle limb from ((h,,l) << cnt) */ +#define SHL(h,l,cnt) \ + ((h << cnt) | ((l >> 1) >> ((~cnt) & (BITS_PER_MP_LIMB - 1)))) + +void +#if __STDC__ +mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn, + mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn) +#else +mpn_tdiv_qr (qp, rp, qxn, np, nn, dp, dn) + mp_ptr qp; + mp_ptr rp; + mp_size_t qxn; + mp_srcptr np; + mp_size_t nn; + mp_srcptr dp; + mp_size_t dn; +#endif +{ + /* FIXME: + 1. qxn + 2. pass allocated storage in additional parameter? + */ + if (qxn != 0) + abort (); + + switch (dn) + { + case 0: + DIVIDE_BY_ZERO; + + case 1: + { + rp[0] = mpn_divmod_1 (qp, np, nn, dp[0]); + return; + } + + case 2: + { + int cnt; + mp_ptr n2p, d2p; + mp_limb_t qhl, cy; + TMP_DECL (marker); + TMP_MARK (marker); + count_leading_zeros (cnt, dp[dn - 1]); + if (cnt != 0) + { + d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_lshift (d2p, dp, dn, cnt); + n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB); + cy = mpn_lshift (n2p, np, nn, cnt); + n2p[nn] = cy; + qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p); + if (cy == 0) + qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */ + } + else + { + d2p = (mp_ptr) dp; + n2p = (mp_ptr) TMP_ALLOC (nn * BYTES_PER_MP_LIMB); + MPN_COPY (n2p, np, nn); + qhl = mpn_divrem_2 (qp, 0L, n2p, nn, d2p); + qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */ + } + + if (cnt != 0) + mpn_rshift (rp, n2p, dn, cnt); + else + MPN_COPY (rp, n2p, dn); + TMP_FREE (marker); + return; + } + + default: + { + int adjust; + TMP_DECL (marker); + TMP_MARK (marker); + adjust = np[nn - 1] >= dp[dn - 1]; /* conservative tests for quotient size */ + if (nn + adjust >= 2 * dn) + { + mp_ptr n2p, d2p; + mp_limb_t cy; + int cnt; + count_leading_zeros (cnt, dp[dn - 1]); + + qp[nn - dn] = 0; /* zero high quotient limb */ + if (cnt != 0) /* normalize divisor if needed */ + { + d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_lshift (d2p, dp, dn, cnt); + n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB); + cy = mpn_lshift (n2p, np, nn, cnt); + n2p[nn] = cy; + nn += adjust; + } + else + { + d2p = (mp_ptr) dp; + n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB); + MPN_COPY (n2p, np, nn); + n2p[nn] = 0; + nn += adjust; + } + + if (dn == 2) + mpn_divrem_2 (qp, 0L, n2p, nn, d2p); + else if (dn < BZ_THRESHOLD) + mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn); + else + { + /* Perform 2*dn / dn limb divisions as long as the limbs + in np last. */ + mp_ptr q2p = qp + nn - 2 * dn; + n2p += nn - 2 * dn; + mpn_bz_divrem_n (q2p, n2p, d2p, dn); + nn -= dn; + while (nn >= 2 * dn) + { + mp_limb_t c; + q2p -= dn; n2p -= dn; + c = mpn_bz_divrem_n (q2p, n2p, d2p, dn); + ASSERT_ALWAYS (c == 0); + nn -= dn; + } + + if (nn != dn) + { + n2p -= nn - dn; + /* In theory, we could fall out to the cute code below + since we now have exactly the situation that code + is designed to handle. We botch this badly and call + the basic mpn_sb_divrem_mn! */ + if (dn == 2) + mpn_divrem_2 (qp, 0L, n2p, nn, d2p); + else + mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn); + } + } + + + if (cnt != 0) + mpn_rshift (rp, n2p, dn, cnt); + else + MPN_COPY (rp, n2p, dn); + TMP_FREE (marker); + return; + } + + /* When we come here, the numerator/partial remainder is less + than twice the size of the denominator. 
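+	 For example, nn = 10 and dn = 8 reach this point with
+	 qn = nn - dn = 2 (possibly 3 after the adjust limb): only the
+	 2*qn most significant numerator limbs and the qn most
+	 significant divisor limbs enter the initial division below,
+	 so the cost is governed by qn rather than dn.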
*/
+
+	{
+	  /* Problem:
+
+	     Divide a numerator N with nn limbs by a denominator D with dn
+	     limbs forming a quotient of nn-dn+1 limbs.  When qn is small
+	     compared to dn, conventional division algorithms perform poorly.
+	     We want an algorithm that has an expected running time that is
+	     dependent only on qn.  It is assumed that the most significant
+	     limb of the numerator is smaller than the most significant limb
+	     of the denominator.
+
+	     Algorithm (very informally stated):
+
+	     1) Divide the 2 x qn most significant limbs from the numerator
+		by the qn most significant limbs from the denominator.  Call
+		the result qest.  This is either the correct quotient, or 1
+		or 2 too large.  Compute the remainder from the division.
+		(This step is implemented by an mpn_divrem call.)
+
+	     2) Is the most significant limb from the remainder < p, where p
+		is the product of the most significant limb from the quotient
+		and the next(d).  (Next(d) denotes the next ignored limb from
+		the denominator.)  If it is, decrement qest, and adjust the
+		remainder accordingly.
+
+	     3) Is the remainder >= qest?  If it is, qest is the desired
+		quotient.  The algorithm terminates.
+
+	     4) Subtract qest x next(d) from the remainder.  If there is
+		borrow out, decrement qest, and adjust the remainder
+		accordingly.
+
+	     5) Skip one word from the denominator (i.e., let next(d) denote
+		the next less significant limb).  */
+
+	  mp_size_t qn;
+	  mp_ptr n2p, d2p;
+	  mp_ptr tp;
+	  mp_limb_t cy;
+	  mp_size_t in, rn;
+	  mp_limb_t quotient_too_large;
+	  int cnt;
+
+	  qn = nn - dn;
+	  qp[qn] = 0;			/* zero high quotient limb */
+	  qn += adjust;			/* qn cannot become bigger */
+
+	  if (qn == 0)
+	    {
+	      MPN_COPY (rp, np, dn);
+	      TMP_FREE (marker);
+	      return;
+	    }
+
+	  in = dn - qn;		/* (at least partially) ignored # of limbs in ops */
+	  /* Normalize denominator by shifting it to the left such that its
+	     most significant bit is set.  Then shift the numerator the same
+	     amount, to mathematically preserve quotient.  */
+	  count_leading_zeros (cnt, dp[dn - 1]);
+	  if (cnt != 0)
+	    {
+	      d2p = (mp_ptr) TMP_ALLOC (qn * BYTES_PER_MP_LIMB);
+
+	      mpn_lshift (d2p, dp + in, qn, cnt);
+	      d2p[0] |= dp[in - 1] >> (BITS_PER_MP_LIMB - cnt);
+
+	      n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
+	      cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt);
+	      if (adjust)
+		{
+		  n2p[2 * qn] = cy;
+		  n2p++;
+		}
+	      else
+		{
+		  n2p[0] |= np[nn - 2 * qn - 1] >> (BITS_PER_MP_LIMB - cnt);
+		}
+	    }
+	  else
+	    {
+	      d2p = (mp_ptr) dp + in;
+
+	      n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
+	      MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn);
+	      if (adjust)
+		{
+		  n2p[2 * qn] = 0;
+		  n2p++;
+		}
+	    }
+
+	  /* Get an approximate quotient using the extracted operands.  */
+	  if (qn == 1)
+	    {
+	      mp_limb_t q0, r0;
+	      mp_limb_t gcc272bug_n1, gcc272bug_n0, gcc272bug_d0;
+	      /* Due to a gcc 2.7.2.3 reload pass bug, we have to use some
+		 temps here.  This doesn't hurt code quality on any machines
+		 so we do it unconditionally.  */
+	      gcc272bug_n1 = n2p[1];
+	      gcc272bug_n0 = n2p[0];
+	      gcc272bug_d0 = d2p[0];
+	      udiv_qrnnd (q0, r0, gcc272bug_n1, gcc272bug_n0, gcc272bug_d0);
+	      n2p[0] = r0;
+	      qp[0] = q0;
+	    }
+	  else if (qn == 2)
+	    mpn_divrem_2 (qp, 0L, n2p, 4L, d2p);
+	  else if (qn < BZ_THRESHOLD)
+	    mpn_sb_divrem_mn (qp, n2p, qn * 2, d2p, qn);
+	  else
+	    mpn_bz_divrem_n (qp, n2p, d2p, qn);
+
+	  rn = qn;
+	  /* Multiply the first ignored divisor limb by the most significant
+	     quotient limb.  If that product is > the partial remainder's
+	     most significant limb, we know the quotient is too large.
This + test quickly catches most cases where the quotient is too large; + it catches all cases where the quotient is 2 too large. */ + { + mp_limb_t dl, x; + mp_limb_t h, l; + + if (in - 2 < 0) + dl = 0; + else + dl = dp[in - 2]; + + x = SHL (dp[in - 1], dl, cnt); + umul_ppmm (h, l, x, qp[qn - 1]); + + if (n2p[qn - 1] < h) + { + mp_limb_t cy; + + mpn_decr_u (qp, (mp_limb_t) 1); + cy = mpn_add_n (n2p, n2p, d2p, qn); + if (cy) + { + /* The partial remainder is safely large. */ + n2p[qn] = cy; + ++rn; + } + } + } + + quotient_too_large = 0; + if (cnt != 0) + { + mp_limb_t cy1, cy2; + + /* Append partially used numerator limb to partial remainder. */ + cy1 = mpn_lshift (n2p, n2p, rn, BITS_PER_MP_LIMB - cnt); + n2p[0] |= np[in - 1] & (~(mp_limb_t) 0 >> cnt); + + /* Update partial remainder with partially used divisor limb. */ + cy2 = mpn_submul_1 (n2p, qp, qn, dp[in - 1] & (~(mp_limb_t) 0 >> cnt)); + if (qn != rn) + { + if (n2p[qn] < cy2) + abort (); + n2p[qn] -= cy2; + } + else + { + n2p[qn] = cy1 - cy2; + + quotient_too_large = (cy1 < cy2); + ++rn; + } + --in; + } + /* True: partial remainder now is neutral, i.e., it is not shifted up. */ + + tp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + + if (in < qn) + { + if (in == 0) + { + MPN_COPY (rp, n2p, rn); + if (rn != dn) + abort (); + goto foo; + } + mpn_mul (tp, qp, qn, dp, in); + } + else + mpn_mul (tp, dp, in, qp, qn); + + cy = mpn_sub (n2p, n2p, rn, tp + in, qn); + MPN_COPY (rp + in, n2p, dn - in); + quotient_too_large |= cy; + cy = mpn_sub_n (rp, np, tp, in); + cy = mpn_sub_1 (rp + in, rp + in, rn, cy); + quotient_too_large |= cy; + foo: + if (quotient_too_large) + { + mpn_decr_u (qp, (mp_limb_t) 1); + mpn_add_n (rp, rp, dp, dn); + } + } + TMP_FREE (marker); + return; + } + } +} diff --git a/rts/gmp/mpn/generic/udiv_w_sdiv.c b/rts/gmp/mpn/generic/udiv_w_sdiv.c new file mode 100644 index 0000000000..061cce86e1 --- /dev/null +++ b/rts/gmp/mpn/generic/udiv_w_sdiv.c @@ -0,0 +1,131 @@ +/* mpn_udiv_w_sdiv -- implement udiv_qrnnd on machines with only signed + division. + + Contributed by Peter L. Montgomery. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY SAFE + TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THIS FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE + GNU MP RELEASE. + + +Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_udiv_w_sdiv (rp, a1, a0, d)
+     mp_limb_t *rp, a1, a0, d;
+{
+  mp_limb_t q, r;
+  mp_limb_t c0, c1, b1;
+
+  if ((mp_limb_signed_t) d >= 0)
+    {
+      if (a1 < d - a1 - (a0 >> (BITS_PER_MP_LIMB - 1)))
+	{
+	  /* dividend, divisor, and quotient are nonnegative */
+	  sdiv_qrnnd (q, r, a1, a0, d);
+	}
+      else
+	{
+	  /* Compute c1*2^32 + c0 = a1*2^32 + a0 - 2^31*d */
+	  sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (BITS_PER_MP_LIMB - 1));
+	  /* Divide (c1*2^32 + c0) by d */
+	  sdiv_qrnnd (q, r, c1, c0, d);
+	  /* Add 2^31 to quotient */
+	  q += (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+	}
+    }
+  else
+    {
+      b1 = d >> 1;			/* d/2, between 2^30 and 2^31 - 1 */
+      c1 = a1 >> 1;			/* A/2 */
+      c0 = (a1 << (BITS_PER_MP_LIMB - 1)) + (a0 >> 1);
+
+      if (a1 < b1)			/* A < 2^32*b1, so A/2 < 2^31*b1 */
+	{
+	  sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */
+
+	  r = 2*r + (a0 & 1);		/* Remainder from A/(2*b1) */
+	  if ((d & 1) != 0)
+	    {
+	      if (r >= q)
+		r = r - q;
+	      else if (q - r <= d)
+		{
+		  r = r - q + d;
+		  q--;
+		}
+	      else
+		{
+		  r = r - q + 2*d;
+		  q -= 2;
+		}
+	    }
+	}
+      else if (c1 < b1)			/* So 2^31 <= (A/2)/b1 < 2^32 */
+	{
+	  c1 = (b1 - 1) - c1;
+	  c0 = ~c0;			/* logical NOT */
+
+	  sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */
+
+	  q = ~q;			/* (A/2)/b1 */
+	  r = (b1 - 1) - r;
+
+	  r = 2*r + (a0 & 1);		/* A/(2*b1) */
+
+	  if ((d & 1) != 0)
+	    {
+	      if (r >= q)
+		r = r - q;
+	      else if (q - r <= d)
+		{
+		  r = r - q + d;
+		  q--;
+		}
+	      else
+		{
+		  r = r - q + 2*d;
+		  q -= 2;
+		}
+	    }
+	}
+      else				/* Implies c1 = b1 */
+	{				/* Hence a1 = d - 1 = 2*b1 - 1 */
+	  if (a0 >= -d)
+	    {
+	      q = -1;
+	      r = a0 + d;
+	    }
+	  else
+	    {
+	      q = -2;
+	      r = a0 + 2*d;
+	    }
+	}
+    }
+
+  *rp = r;
+  return q;
+}
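
To make the contract above concrete, here is a minimal standalone sketch (illustrative only, not part of the patch) that computes the same two-limb by one-limb division with plain 64-bit arithmetic, assuming 32-bit limbs. mpn_udiv_w_sdiv must return this q and store this r through rp while using only signed division internally; the operand values are made up, chosen so that d has its high bit set and a1 < d (the usual udiv_qrnnd precondition).

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* Two-limb dividend a1:a0 and one-limb divisor d; d has its high bit
     set and a1 < d, so the quotient fits in a single 32-bit limb.  */
  uint32_t a1 = 0x12345678, a0 = 0x9abcdef0, d = 0xfedcba98;
  uint64_t n = ((uint64_t) a1 << 32) | a0;

  /* The contract of mpn_udiv_w_sdiv: *rp = n % d, return value = n / d.  */
  uint32_t q = (uint32_t) (n / d);
  uint32_t r = (uint32_t) (n % d);

  printf ("q = 0x%08lx  r = 0x%08lx\n", (unsigned long) q, (unsigned long) r);
  return 0;
}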