Diffstat (limited to 'rts/gmp/mpn/generic')
-rw-r--r--  rts/gmp/mpn/generic/add_n.c          62
-rw-r--r--  rts/gmp/mpn/generic/addmul_1.c       65
-rw-r--r--  rts/gmp/mpn/generic/addsub_n.c      167
-rw-r--r--  rts/gmp/mpn/generic/bdivmod.c       120
-rw-r--r--  rts/gmp/mpn/generic/bz_divrem_n.c   153
-rw-r--r--  rts/gmp/mpn/generic/cmp.c            56
-rw-r--r--  rts/gmp/mpn/generic/diveby3.c        77
-rw-r--r--  rts/gmp/mpn/generic/divrem.c        101
-rw-r--r--  rts/gmp/mpn/generic/divrem_1.c      248
-rw-r--r--  rts/gmp/mpn/generic/divrem_2.c      151
-rw-r--r--  rts/gmp/mpn/generic/dump.c           76
-rw-r--r--  rts/gmp/mpn/generic/gcd.c           414
-rw-r--r--  rts/gmp/mpn/generic/gcd_1.c          77
-rw-r--r--  rts/gmp/mpn/generic/gcdext.c        700
-rw-r--r--  rts/gmp/mpn/generic/get_str.c       216
-rw-r--r--  rts/gmp/mpn/generic/gmp-mparam.h     27
-rw-r--r--  rts/gmp/mpn/generic/hamdist.c        94
-rw-r--r--  rts/gmp/mpn/generic/inlines.c        24
-rw-r--r--  rts/gmp/mpn/generic/jacbase.c       136
-rw-r--r--  rts/gmp/mpn/generic/lshift.c         87
-rw-r--r--  rts/gmp/mpn/generic/mod_1.c         175
-rw-r--r--  rts/gmp/mpn/generic/mod_1_rs.c      111
-rw-r--r--  rts/gmp/mpn/generic/mul.c           190
-rw-r--r--  rts/gmp/mpn/generic/mul_1.c          59
-rw-r--r--  rts/gmp/mpn/generic/mul_basecase.c   87
-rw-r--r--  rts/gmp/mpn/generic/mul_fft.c       772
-rw-r--r--  rts/gmp/mpn/generic/mul_n.c        1343
-rw-r--r--  rts/gmp/mpn/generic/perfsqr.c       123
-rw-r--r--  rts/gmp/mpn/generic/popcount.c       93
-rw-r--r--  rts/gmp/mpn/generic/pre_mod_1.c      69
-rw-r--r--  rts/gmp/mpn/generic/random.c         43
-rw-r--r--  rts/gmp/mpn/generic/random2.c       105
-rw-r--r--  rts/gmp/mpn/generic/rshift.c         88
-rw-r--r--  rts/gmp/mpn/generic/sb_divrem_mn.c  201
-rw-r--r--  rts/gmp/mpn/generic/scan0.c          62
-rw-r--r--  rts/gmp/mpn/generic/scan1.c          62
-rw-r--r--  rts/gmp/mpn/generic/set_str.c       159
-rw-r--r--  rts/gmp/mpn/generic/sqr_basecase.c   83
-rw-r--r--  rts/gmp/mpn/generic/sqrtrem.c       509
-rw-r--r--  rts/gmp/mpn/generic/sub_n.c          62
-rw-r--r--  rts/gmp/mpn/generic/submul_1.c       65
-rw-r--r--  rts/gmp/mpn/generic/tdiv_qr.c       401
-rw-r--r--  rts/gmp/mpn/generic/udiv_w_sdiv.c   131
43 files changed, 8044 insertions, 0 deletions
diff --git a/rts/gmp/mpn/generic/add_n.c b/rts/gmp/mpn/generic/add_n.c
new file mode 100644
index 0000000000..5fcb7e4835
--- /dev/null
+++ b/rts/gmp/mpn/generic/add_n.c
@@ -0,0 +1,62 @@
+/* mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+#if __STDC__
+mpn_add_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
+#else
+mpn_add_n (res_ptr, s1_ptr, s2_ptr, size)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ register mp_srcptr s2_ptr;
+ mp_size_t size;
+#endif
+{
+ register mp_limb_t x, y, cy;
+ register mp_size_t j;
+
+  /* The loop counter and index J go from -SIZE to -1.  This way
+ the loop becomes faster. */
+ j = -size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ s2_ptr -= j;
+ res_ptr -= j;
+
+ cy = 0;
+ do
+ {
+ y = s2_ptr[j];
+ x = s1_ptr[j];
+ y += cy; /* add previous carry to one addend */
+ cy = (y < cy); /* get out carry from that addition */
+ y = x + y; /* add other addend */
+ cy = (y < x) + cy; /* get out carry from that add, combine */
+ res_ptr[j] = y;
+ }
+ while (++j != 0);
+
+ return cy;
+}
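
The carry recovery in the loop above is the standard portable idiom: an unsigned sum wrapped around exactly when it comes out smaller than one of its addends, and the two partial carries can never both be 1. A minimal standalone sketch (the helper name add_with_carry and 64-bit limbs are our assumptions, not GMP's):

#include <stdint.h>
#include <stdio.h>

/* One limb position of add_n: add y, a 0/1 carry-in and x, producing
   the sum limb and a 0/1 carry-out, without any wider type. */
static uint64_t
add_with_carry (uint64_t x, uint64_t y, unsigned cin, unsigned *cout)
{
  uint64_t s = y + cin;
  unsigned c = (s < cin);   /* carry out of y + cin */
  s = x + s;
  c += (s < x);             /* carry out of x + s; if the first addition
                               wrapped, s was 0 here, so this one cannot
                               also wrap -- c stays 0 or 1 */
  *cout = c;
  return s;
}

int
main (void)
{
  unsigned c;
  uint64_t s = add_with_carry (~(uint64_t) 0, 1, 0, &c);
  printf ("sum=%llu carry=%u\n", (unsigned long long) s, c);  /* sum=0 carry=1 */
  return 0;
}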
diff --git a/rts/gmp/mpn/generic/addmul_1.c b/rts/gmp/mpn/generic/addmul_1.c
new file mode 100644
index 0000000000..746ae31307
--- /dev/null
+++ b/rts/gmp/mpn/generic/addmul_1.c
@@ -0,0 +1,65 @@
+/* mpn_addmul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
+ by S2_LIMB, add the S1_SIZE least significant limbs of the product to the
+ limb vector pointed to by RES_PTR. Return the most significant limb of
+ the product, adjusted for carry-out from the addition.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_addmul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ mp_size_t s1_size;
+ register mp_limb_t s2_limb;
+{
+ register mp_limb_t cy_limb;
+ register mp_size_t j;
+ register mp_limb_t prod_high, prod_low;
+ register mp_limb_t x;
+
+  /* The loop counter and index J go from -SIZE to -1.  This way
+ the loop becomes faster. */
+ j = -s1_size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ res_ptr -= j;
+ s1_ptr -= j;
+
+ cy_limb = 0;
+ do
+ {
+ umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb) + prod_high;
+
+ x = res_ptr[j];
+ prod_low = x + prod_low;
+ cy_limb += (prod_low < x);
+ res_ptr[j] = prod_low;
+ }
+ while (++j != 0);
+
+ return cy_limb;
+}
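
On compilers with a double-width integer type, the whole loop body above collapses into one wide multiply-add. A sketch of a single step, assuming GCC/Clang's unsigned __int128 and 64-bit limbs (addmul_step is a hypothetical name):

#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

/* res_limb + s1_limb * s2_limb + carry-in; the carry-out can be almost
   a full limb, since the product contributes a high limb. */
static uint64_t
addmul_step (uint64_t res, uint64_t s1, uint64_t s2, uint64_t *carry)
{
  u128 t = (u128) s1 * s2 + res + *carry;   /* never overflows 128 bits */
  *carry = (uint64_t) (t >> 64);
  return (uint64_t) t;
}

int
main (void)
{
  uint64_t carry = 0;
  uint64_t r = addmul_step (5, ~(uint64_t) 0, ~(uint64_t) 0, &carry);
  /* (2^64-1)^2 + 5 = 2^128 - 2^65 + 6: low limb 6, carry 2^64 - 2 */
  printf ("low=%llu carry=%llu\n",
          (unsigned long long) r, (unsigned long long) carry);
  return 0;
}

The worst case (2^64-1)^2 + (2^64-1) + (2^64-1) equals exactly 2^128 - 1, which is why the 128-bit accumulation cannot overflow.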
diff --git a/rts/gmp/mpn/generic/addsub_n.c b/rts/gmp/mpn/generic/addsub_n.c
new file mode 100644
index 0000000000..c9bab3ef60
--- /dev/null
+++ b/rts/gmp/mpn/generic/addsub_n.c
@@ -0,0 +1,167 @@
+/* mpn_addsub_n -- Add and Subtract two limb vectors of equal, non-zero length.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifndef L1_CACHE_SIZE
+#define L1_CACHE_SIZE 8192 /* only 68040 has less than this */
+#endif
+
+#define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6)
+
+
+/* mpn_addsub_n.
+ r1[] = s1[] + s2[]
+ r2[] = s1[] - s2[]
+ All operands have n limbs.
+ In-place operations allowed. */
+mp_limb_t
+#if __STDC__
+mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
+#else
+mpn_addsub_n (r1p, r2p, s1p, s2p, n)
+ mp_ptr r1p, r2p;
+ mp_srcptr s1p, s2p;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t acyn, acyo; /* carry for add */
+ mp_limb_t scyn, scyo; /* carry for subtract */
+ mp_size_t off; /* offset in operands */
+ mp_size_t this_n; /* size of current chunk */
+
+  /* We alternately add and subtract in chunks that fit into the (L1)
+ cache. Since the chunks are several hundred limbs, the function call
+ overhead is insignificant, but we get much better locality. */
+
+  /* We have three variants of the inner loop; the proper one is chosen
+     depending on whether r1 or r2 is the same operand as s1 or s2.  */
+
+ if (r1p != s1p && r1p != s2p)
+ {
+ /* r1 is not identical to either input operand. We can therefore write
+ to r1 directly, without using temporary storage. */
+ acyo = 0;
+ scyo = 0;
+ for (off = 0; off < n; off += PART_SIZE)
+ {
+ this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
+ acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
+#else
+ acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
+ acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
+#endif
+#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
+ scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+ scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+ scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+ }
+ }
+ else if (r2p != s1p && r2p != s2p)
+ {
+ /* r2 is not identical to either input operand. We can therefore write
+ to r2 directly, without using temporary storage. */
+ acyo = 0;
+ scyo = 0;
+ for (off = 0; off < n; off += PART_SIZE)
+ {
+ this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
+ scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+ scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+ scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
+ acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
+#else
+ acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
+ acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
+#endif
+ }
+ }
+ else
+ {
+      /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2==s2, or vice
+	 versa).  Need temporary storage. */
+ mp_limb_t tp[PART_SIZE];
+ acyo = 0;
+ scyo = 0;
+ for (off = 0; off < n; off += PART_SIZE)
+ {
+ this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
+ acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
+#else
+ acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
+ acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
+#endif
+#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
+ scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+ scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+ scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+ MPN_COPY (r1p + off, tp, this_n);
+ }
+ }
+
+ return 2 * acyo + scyo;
+}
+
+#ifdef MAIN
+#include <stdlib.h>
+#include <stdio.h>
+#include "timing.h"
+
+long cputime ();
+
+int
+main (int argc, char **argv)
+{
+ mp_ptr r1p, r2p, s1p, s2p;
+ double t;
+ mp_size_t n;
+
+ n = strtol (argv[1], 0, 0);
+
+ r1p = malloc (n * BYTES_PER_MP_LIMB);
+ r2p = malloc (n * BYTES_PER_MP_LIMB);
+ s1p = malloc (n * BYTES_PER_MP_LIMB);
+ s2p = malloc (n * BYTES_PER_MP_LIMB);
+ TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
+ printf (" separate add and sub: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,s1p,s2p,n));
+ printf ("combined addsub separate variables: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
+ printf (" combined addsub r1 overlap: %.3f\n", t);
+  TIME (t,mpn_addsub_n(r1p,r2p,s1p,r2p,n));
+ printf (" combined addsub r2 overlap: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,r2p,n));
+ printf (" combined addsub in-place: %.3f\n", t);
+
+ return 0;
+}
+#endif
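
The blocking idea is independent of GMP: make two passes over the data, but in chunks small enough that a chunk is still in L1 when the second pass touches it. A generic sketch of the same traversal, where CHUNK is an assumed tuning constant playing the role of PART_SIZE:

#include <stddef.h>

#define CHUNK 512

static void
two_pass_blocked (long *sum, long *dif, const long *a, const long *b, size_t n)
{
  size_t off, i, m;
  for (off = 0; off < n; off += CHUNK)
    {
      m = n - off < CHUNK ? n - off : CHUNK;
      for (i = 0; i < m; i++)           /* first pass over the chunk */
        sum[off + i] = a[off + i] + b[off + i];
      for (i = 0; i < m; i++)           /* second pass, data still hot */
        dif[off + i] = a[off + i] - b[off + i];
    }
}

int
main (void)
{
  long a[1000], b[1000], sum[1000], dif[1000];
  size_t i;
  for (i = 0; i < 1000; i++)
    a[i] = i, b[i] = 2 * i;
  two_pass_blocked (sum, dif, a, b, 1000);
  return sum[999] == 2997 && dif[999] == -999 ? 0 : 1;
}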
diff --git a/rts/gmp/mpn/generic/bdivmod.c b/rts/gmp/mpn/generic/bdivmod.c
new file mode 100644
index 0000000000..c4bcb414e6
--- /dev/null
+++ b/rts/gmp/mpn/generic/bdivmod.c
@@ -0,0 +1,120 @@
+/* mpn/bdivmod.c: mpn_bdivmod for computing U/V mod 2^d.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* q_high = mpn_bdivmod (qp, up, usize, vp, vsize, d).
+
+ Puts the low d/BITS_PER_MP_LIMB limbs of Q = U / V mod 2^d at qp, and
+ returns the high d%BITS_PER_MP_LIMB bits of Q as the result.
+
+ Also, U - Q * V mod 2^(usize*BITS_PER_MP_LIMB) is placed at up. Since the
+ low d/BITS_PER_MP_LIMB limbs of this difference are zero, the code allows
+ the limb vectors at qp to overwrite the low limbs at up, provided qp <= up.
+
+ Preconditions:
+ 1. V is odd.
+ 2. usize * BITS_PER_MP_LIMB >= d.
+ 3. If Q and U overlap, qp <= up.
+
+ Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
+
+ Funding for this work has been partially provided by Conselho Nacional
+ de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
+   301314194-2, and was done while I was a visiting researcher in the Instituto
+ de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
+
+ References:
+ T. Jebelean, An algorithm for exact division, Journal of Symbolic
+ Computation, v. 15, 1993, pp. 169-180.
+
+ K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
+ Mathematical Software, v. 21 (March), 1995, pp. 111-122. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+#if __STDC__
+mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize,
+ mp_srcptr vp, mp_size_t vsize, unsigned long int d)
+#else
+mpn_bdivmod (qp, up, usize, vp, vsize, d)
+ mp_ptr qp;
+ mp_ptr up;
+ mp_size_t usize;
+ mp_srcptr vp;
+ mp_size_t vsize;
+ unsigned long int d;
+#endif
+{
+ mp_limb_t v_inv;
+
+ /* 1/V mod 2^BITS_PER_MP_LIMB. */
+ modlimb_invert (v_inv, vp[0]);
+
+ /* Fast code for two cases previously used by the accel part of mpn_gcd.
+     (Could probably remove this now that it's inlined there.)  */
+ if (usize == 2 && vsize == 2 &&
+ (d == BITS_PER_MP_LIMB || d == 2*BITS_PER_MP_LIMB))
+ {
+ mp_limb_t hi, lo;
+ mp_limb_t q = up[0] * v_inv;
+ umul_ppmm (hi, lo, q, vp[0]);
+ up[0] = 0, up[1] -= hi + q*vp[1], qp[0] = q;
+ if (d == 2*BITS_PER_MP_LIMB)
+ q = up[1] * v_inv, up[1] = 0, qp[1] = q;
+ return 0;
+ }
+
+ /* Main loop. */
+ while (d >= BITS_PER_MP_LIMB)
+ {
+ mp_limb_t q = up[0] * v_inv;
+ mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);
+ if (usize > vsize)
+ mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
+ d -= BITS_PER_MP_LIMB;
+ up += 1, usize -= 1;
+ *qp++ = q;
+ }
+
+ if (d)
+ {
+ mp_limb_t b;
+ mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1<<d) - 1);
+ if (q <= 1)
+ {
+ if (q == 0)
+ return 0;
+ else
+ b = mpn_sub_n (up, up, vp, MIN (usize, vsize));
+ }
+ else
+ b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);
+
+ if (usize > vsize)
+ mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
+ return q;
+ }
+
+ return 0;
+}
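
The modlimb_invert step used above rests on Newton iteration modulo powers of two: if d*x == 1 (mod 2^k), then x' = x*(2 - d*x) satisfies d*x' == 1 (mod 2^(2k)), so the number of correct low bits doubles each step. A self-contained 64-bit sketch of that iteration and of the exact-division step q = u0 * v_inv from the main loop (modlimb_invert64 is our name for it, not GMP's):

#include <stdint.h>
#include <stdio.h>

static uint64_t
modlimb_invert64 (uint64_t d)   /* d must be odd */
{
  uint64_t x = d;               /* d odd => d*d == 1 (mod 8): 3 bits */
  x *= 2 - d * x;               /* 6 bits */
  x *= 2 - d * x;               /* 12 bits */
  x *= 2 - d * x;               /* 24 bits */
  x *= 2 - d * x;               /* 48 bits */
  x *= 2 - d * x;               /* 96 >= 64 bits: done */
  return x;
}

int
main (void)
{
  uint64_t di = modlimb_invert64 (3);
  uint64_t u = 0x123456789abcdefULL * 3;   /* a known multiple of 3 */
  printf ("3*di = %llu (mod 2^64)\n", (unsigned long long) (3 * di));  /* 1 */
  printf ("u/3  = 0x%llx\n", (unsigned long long) (u * di));
  return 0;                     /* prints 0x123456789abcdef: exact division */
}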
diff --git a/rts/gmp/mpn/generic/bz_divrem_n.c b/rts/gmp/mpn/generic/bz_divrem_n.c
new file mode 100644
index 0000000000..d234b22af5
--- /dev/null
+++ b/rts/gmp/mpn/generic/bz_divrem_n.c
@@ -0,0 +1,153 @@
+/* mpn_bz_divrem_n and auxiliary routines.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE
+ INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+ IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A
+ FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+Contributed by Paul Zimmermann.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/*
+[1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler,
+   Technical report MPI-I-98-1-022, October 1998.
+ http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz
+*/
+
+static mp_limb_t mpn_bz_div_3_halves_by_2
+ _PROTO ((mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n));
+
+
+/* mpn_bz_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than
+ div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way,
+ i.e. n/2 >= KARATSUBA_MUL_THRESHOLD */
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD)
+#endif
+
+#if 0
+static
+unused_mpn_divrem (qp, qxn, np, nn, dp, dn)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_ptr np;
+ mp_size_t nn;
+ mp_srcptr dp;
+ mp_size_t dn;
+{
+ /* This might be useful: */
+ if (qxn != 0)
+ {
+ mp_limb_t c;
+ mp_ptr tp = alloca ((nn + qxn) * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp + qxn - nn, np, nn);
+ MPN_ZERO (tp, qxn);
+ c = mpn_divrem (qp, 0L, tp, nn + qxn, dp, dn);
+ /* Maybe copy proper part of tp to np? Documentation is unclear about
+ the returned np value when qxn != 0 */
+ return c;
+ }
+}
+#endif
+
+
+/* mpn_bz_divrem_n - Implements algorithm of page 8 in [1]: divides (np,2n)
+ by (dp,n) and puts the quotient in (qp,n), the remainder in (np,n).
+ Returns most significant limb of the quotient, which is 0 or 1.
+ Requires that the most significant bit of the divisor is set. */
+
+mp_limb_t
+#if __STDC__
+mpn_bz_divrem_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
+#else
+mpn_bz_divrem_n (qp, np, dp, n)
+ mp_ptr qp;
+ mp_ptr np;
+ mp_srcptr dp;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t qhl, cc;
+
+ if (n % 2 != 0)
+ {
+ qhl = mpn_bz_divrem_n (qp + 1, np + 2, dp + 1, n - 1);
+ cc = mpn_submul_1 (np + 1, qp + 1, n - 1, dp[0]);
+ cc = mpn_sub_1 (np + n, np + n, 1, cc);
+ if (qhl) cc += mpn_sub_1 (np + n, np + n, 1, dp[0]);
+ while (cc)
+ {
+ qhl -= mpn_sub_1 (qp + 1, qp + 1, n - 1, (mp_limb_t) 1);
+ cc -= mpn_add_n (np + 1, np + 1, dp, n);
+ }
+ qhl += mpn_add_1 (qp + 1, qp + 1, n - 1,
+ mpn_sb_divrem_mn (qp, np, n + 1, dp, n));
+ }
+ else
+ {
+ mp_size_t n2 = n/2;
+ qhl = mpn_bz_div_3_halves_by_2 (qp + n2, np + n2, dp, n2);
+ qhl += mpn_add_1 (qp + n2, qp + n2, n2,
+ mpn_bz_div_3_halves_by_2 (qp, np, dp, n2));
+ }
+ return qhl;
+}
+
+
+/* divides (np, 3n) by (dp, 2n) and puts the quotient in (qp, n),
+ the remainder in (np, 2n) */
+
+static mp_limb_t
+#if __STDC__
+mpn_bz_div_3_halves_by_2 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
+#else
+mpn_bz_div_3_halves_by_2 (qp, np, dp, n)
+ mp_ptr qp;
+ mp_ptr np;
+ mp_srcptr dp;
+ mp_size_t n;
+#endif
+{
+ mp_size_t twon = n + n;
+ mp_limb_t qhl, cc;
+ mp_ptr tmp;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ if (n < BZ_THRESHOLD)
+ qhl = mpn_sb_divrem_mn (qp, np + n, twon, dp + n, n);
+ else
+ qhl = mpn_bz_divrem_n (qp, np + n, dp + n, n);
+ tmp = (mp_ptr) TMP_ALLOC (twon * BYTES_PER_MP_LIMB);
+ mpn_mul_n (tmp, qp, dp, n);
+ cc = mpn_sub_n (np, np, tmp, twon);
+ TMP_FREE (marker);
+ if (qhl) cc += mpn_sub_n (np + n, np + n, dp, n);
+ while (cc)
+ {
+ qhl -= mpn_sub_1 (qp, qp, n, (mp_limb_t) 1);
+ cc -= mpn_add_n (np, np, dp, twon);
+ }
+ return qhl;
+}
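
The threshold comment above can be made concrete by unrolling the cost recurrence D(n) = 2*D(n/2) + 2*M(n/2) numerically. A rough, purely illustrative sketch with M(n) = n^1.585 for Karatsuba and an assumed schoolbook base case of 32 limbs (link with -lm); the ratio D(n)/M(n) settles toward a small constant, which is the point of the algorithm:

#include <stdio.h>
#include <math.h>

static double
M (double n)                    /* Karatsuba multiply cost model */
{
  return pow (n, 1.585);
}

static double
D (double n)                    /* division cost recurrence from above */
{
  if (n <= 32)
    return n * n;               /* schoolbook base case (assumed) */
  return 2 * D (n / 2) + 2 * M (n / 2);
}

int
main (void)
{
  double n;
  for (n = 64; n <= 65536; n *= 4)
    printf ("n=%6.0f  D(n)/M(n) = %.2f\n", n, D (n) / M (n));
  return 0;
}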
diff --git a/rts/gmp/mpn/generic/cmp.c b/rts/gmp/mpn/generic/cmp.c
new file mode 100644
index 0000000000..8e9792f54e
--- /dev/null
+++ b/rts/gmp/mpn/generic/cmp.c
@@ -0,0 +1,56 @@
+/* mpn_cmp -- Compare two low-level natural-number integers.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Compare OP1_PTR/OP1_SIZE with OP2_PTR/OP2_SIZE.
+ There are no restrictions on the relative sizes of
+ the two arguments.
+ Return 1 if OP1 > OP2, 0 if they are equal, and -1 if OP1 < OP2. */
+
+int
+#if __STDC__
+mpn_cmp (mp_srcptr op1_ptr, mp_srcptr op2_ptr, mp_size_t size)
+#else
+mpn_cmp (op1_ptr, op2_ptr, size)
+ mp_srcptr op1_ptr;
+ mp_srcptr op2_ptr;
+ mp_size_t size;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t op1_word, op2_word;
+
+ for (i = size - 1; i >= 0; i--)
+ {
+ op1_word = op1_ptr[i];
+ op2_word = op2_ptr[i];
+ if (op1_word != op2_word)
+ goto diff;
+ }
+ return 0;
+ diff:
+ /* This can *not* be simplified to
+	op1_word - op2_word
+ since that expression might give signed overflow. */
+ return (op1_word > op2_word) ? 1 : -1;
+}
diff --git a/rts/gmp/mpn/generic/diveby3.c b/rts/gmp/mpn/generic/diveby3.c
new file mode 100644
index 0000000000..a2fb552bfa
--- /dev/null
+++ b/rts/gmp/mpn/generic/diveby3.c
@@ -0,0 +1,77 @@
+/* mpn_divexact_by3 -- mpn division by 3, expecting no remainder. */
+
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
+ 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
+#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1)
+
+
+/* The "c += ..."s are adding the high limb of 3*l to c. That high limb
+ will be 0, 1 or 2. Doing two separate "+="s seems to turn out better
+ code on gcc (as of 2.95.2 at least).
+
+ When a subtraction of a 0,1,2 carry value causes a borrow, that leaves a
+ limb value of either 0xFF...FF or 0xFF...FE and the multiply by INVERSE_3
+ gives 0x55...55 or 0xAA...AA respectively, producing a further borrow of
+ only 0 or 1 respectively. Hence the carry out of each stage and for the
+ return value is always only 0, 1 or 2. */
+
+mp_limb_t
+#if __STDC__
+mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t c)
+#else
+mpn_divexact_by3c (dst, src, size, c)
+ mp_ptr dst;
+ mp_srcptr src;
+ mp_size_t size;
+ mp_limb_t c;
+#endif
+{
+ mp_size_t i;
+
+ ASSERT (size >= 1);
+
+ i = 0;
+ do
+ {
+ mp_limb_t l, s;
+
+ s = src[i];
+ l = s - c;
+ c = (l > s);
+
+ l *= INVERSE_3;
+ dst[i] = l;
+
+ c += (l > MP_LIMB_T_MAX/3);
+ c += (l > (MP_LIMB_T_MAX/3)*2);
+ }
+ while (++i < size);
+
+ return c;
+}
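
The per-limb recurrence above is easy to exercise in isolation. A sketch running it on 64-bit limbs for the two-limb value 2^64 + 2, which is exactly divisible by 3; the constants mirror INVERSE_3 and the MP_LIMB_T_MAX/3 comparisons:

#include <stdint.h>
#include <stdio.h>

#define MAX64 (~(uint64_t) 0)
#define INV3  ((MAX64 / 3) * 2 + 1)         /* 0xAAAAAAAAAAAAAAAB */

int
main (void)
{
  uint64_t src[2] = { 2, 1 };               /* 2^64 + 2, least limb first */
  uint64_t dst[2], c = 0;
  int i;
  for (i = 0; i < 2; i++)
    {
      uint64_t s = src[i], l = s - c;
      c = (l > s);                          /* borrow from subtracting c */
      l *= INV3;
      dst[i] = l;
      c += (l > MAX64 / 3);                 /* high limb of 3*l: 0, 1 or 2 */
      c += (l > (MAX64 / 3) * 2);
    }
  /* expect dst = { 0x5555555555555556, 0 } and c = 0 */
  printf ("0x%llx 0x%llx carry %llu\n",
          (unsigned long long) dst[0], (unsigned long long) dst[1],
          (unsigned long long) c);
  return 0;
}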
diff --git a/rts/gmp/mpn/generic/divrem.c b/rts/gmp/mpn/generic/divrem.c
new file mode 100644
index 0000000000..30673e76d9
--- /dev/null
+++ b/rts/gmp/mpn/generic/divrem.c
@@ -0,0 +1,101 @@
+/* mpn_divrem -- Divide natural numbers, producing both remainder and
+ quotient. This is now just a middle layer for calling the new
+ internal mpn_tdiv_qr.
+
+Copyright (C) 1993, 1994, 1995, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+#if __STDC__
+mpn_divrem (mp_ptr qp, mp_size_t qxn,
+ mp_ptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn)
+#else
+mpn_divrem (qp, qxn, np, nn, dp, dn)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_ptr np;
+ mp_size_t nn;
+ mp_srcptr dp;
+ mp_size_t dn;
+#endif
+{
+ if (dn == 1)
+ {
+ mp_limb_t ret;
+ mp_ptr q2p;
+ mp_size_t qn;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ q2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
+
+ np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]);
+ qn = nn + qxn - 1;
+ MPN_COPY (qp, q2p, qn);
+ ret = q2p[qn];
+
+ TMP_FREE (marker);
+ return ret;
+ }
+ else if (dn == 2)
+ {
+ return mpn_divrem_2 (qp, qxn, np, nn, dp);
+ }
+ else
+ {
+ mp_ptr rp, q2p;
+ mp_limb_t qhl;
+ mp_size_t qn;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ if (qxn != 0)
+ {
+ mp_ptr n2p;
+ n2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
+ MPN_ZERO (n2p, qxn);
+ MPN_COPY (n2p + qxn, np, nn);
+ q2p = (mp_ptr) TMP_ALLOC ((nn - dn + qxn + 1) * BYTES_PER_MP_LIMB);
+ rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_tdiv_qr (q2p, rp, 0L, n2p, nn + qxn, dp, dn);
+ MPN_COPY (np, rp, dn);
+ qn = nn - dn + qxn;
+ MPN_COPY (qp, q2p, qn);
+ qhl = q2p[qn];
+ }
+ else
+ {
+ q2p = (mp_ptr) TMP_ALLOC ((nn - dn + 1) * BYTES_PER_MP_LIMB);
+ rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_tdiv_qr (q2p, rp, 0L, np, nn, dp, dn);
+ MPN_COPY (np, rp, dn); /* overwrite np area with remainder */
+ qn = nn - dn;
+ MPN_COPY (qp, q2p, qn);
+ qhl = q2p[qn];
+ }
+ TMP_FREE (marker);
+ return qhl;
+ }
+}
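
The qxn argument deserves a gloss: each extra quotient limb is one more base-2^BITS_PER_MP_LIMB digit after the radix point, developed by dividing (remainder, 0) by the divisor, as mpn_divrem_1 does below in the dn == 1 case. A sketch of that semantics for d = 3, where every fraction limb of 1/3 comes out 0x5555555555555555 (assumes GCC/Clang's unsigned __int128):

#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

int
main (void)
{
  uint64_t d = 3, q, r = 1;       /* integer part of 1/3 is 0, remainder 1 */
  int i;
  for (i = 0; i < 2; i++)         /* develop two fraction limbs */
    {
      u128 n = (u128) r << 64;    /* shift remainder up one limb */
      q = (uint64_t) (n / d);
      r = (uint64_t) (n % d);
      printf ("fraction limb %d: 0x%llx, rem %llu\n",
              i, (unsigned long long) q, (unsigned long long) r);
    }
  return 0;
}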
diff --git a/rts/gmp/mpn/generic/divrem_1.c b/rts/gmp/mpn/generic/divrem_1.c
new file mode 100644
index 0000000000..e93f241c9d
--- /dev/null
+++ b/rts/gmp/mpn/generic/divrem_1.c
@@ -0,0 +1,248 @@
+/* mpn_divrem_1(quot_ptr, qsize, dividend_ptr, dividend_size, divisor_limb) --
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
+ Return the single-limb remainder.
+ There are no constraints on the value of the divisor.
+
+ QUOT_PTR and DIVIDEND_PTR might point to the same limb.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+
+/* __gmpn_divmod_1_internal(quot_ptr,dividend_ptr,dividend_size,divisor_limb)
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
+ Return the single-limb remainder.
+ There are no constraints on the value of the divisor.
+
+ QUOT_PTR and DIVIDEND_PTR might point to the same limb. */
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+static mp_limb_t
+#if __STDC__
+__gmpn_divmod_1_internal (mp_ptr quot_ptr,
+ mp_srcptr dividend_ptr, mp_size_t dividend_size,
+ mp_limb_t divisor_limb)
+#else
+__gmpn_divmod_1_internal (quot_ptr, dividend_ptr, dividend_size, divisor_limb)
+ mp_ptr quot_ptr;
+ mp_srcptr dividend_ptr;
+ mp_size_t dividend_size;
+ mp_limb_t divisor_limb;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t n1, n0, r;
+ int dummy;
+
+ /* ??? Should this be handled at all? Rely on callers? */
+ if (dividend_size == 0)
+ return 0;
+
+ /* If multiplication is much faster than division, and the
+ dividend is large, pre-invert the divisor, and use
+ only multiplications in the inner loop. */
+
+ /* This test should be read:
+ Does it ever help to use udiv_qrnnd_preinv?
+ && Does what we save compensate for the inversion overhead? */
+ if (UDIV_TIME > (2 * UMUL_TIME + 6)
+ && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, divisor_limb);
+ if (normalization_steps != 0)
+ {
+ mp_limb_t divisor_limb_inverted;
+
+ divisor_limb <<= normalization_steps;
+ invert_limb (divisor_limb_inverted, divisor_limb);
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ if (r == 0
+ && divisor_limb > ((n1 << normalization_steps)
+ | (dividend_ptr[dividend_size - 2] >> ...)))
+ ...one division less... */
+
+ for (i = dividend_size - 2; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (quot_ptr[i + 1], r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+ divisor_limb, divisor_limb_inverted);
+ n1 = n0;
+ }
+ udiv_qrnnd_preinv (quot_ptr[0], r, r,
+ n1 << normalization_steps,
+ divisor_limb, divisor_limb_inverted);
+ return r >> normalization_steps;
+ }
+ else
+ {
+ mp_limb_t divisor_limb_inverted;
+
+ invert_limb (divisor_limb_inverted, divisor_limb);
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ {
+ quot_ptr[i] = 0;
+ i--;
+ }
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (quot_ptr[i], r, r,
+ n0, divisor_limb, divisor_limb_inverted);
+ }
+ return r;
+ }
+ }
+ else
+ {
+ if (UDIV_NEEDS_NORMALIZATION)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, divisor_limb);
+ if (normalization_steps != 0)
+ {
+ divisor_limb <<= normalization_steps;
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ if (r == 0
+ && divisor_limb > ((n1 << normalization_steps)
+ | (dividend_ptr[dividend_size - 2] >> ...)))
+ ...one division less... */
+
+ for (i = dividend_size - 2; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd (quot_ptr[i + 1], r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+ divisor_limb);
+ n1 = n0;
+ }
+ udiv_qrnnd (quot_ptr[0], r, r,
+ n1 << normalization_steps,
+ divisor_limb);
+ return r >> normalization_steps;
+ }
+ }
+ /* No normalization needed, either because udiv_qrnnd doesn't require
+ it, or because DIVISOR_LIMB is already normalized. */
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ {
+ quot_ptr[i] = 0;
+ i--;
+ }
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd (quot_ptr[i], r, r, n0, divisor_limb);
+ }
+ return r;
+ }
+}
+
+
+
+mp_limb_t
+#if __STDC__
+mpn_divrem_1 (mp_ptr qp, mp_size_t qxn,
+ mp_srcptr np, mp_size_t nn,
+ mp_limb_t d)
+#else
+mpn_divrem_1 (qp, qxn, np, nn, d)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_srcptr np;
+ mp_size_t nn;
+ mp_limb_t d;
+#endif
+{
+ mp_limb_t rlimb;
+ mp_size_t i;
+
+ /* Develop integer part of quotient. */
+ rlimb = __gmpn_divmod_1_internal (qp + qxn, np, nn, d);
+
+  /* Develop fraction part of quotient.  This is not as fast as it should be;
+ the preinvert stuff from __gmpn_divmod_1_internal ought to be used here
+ too. */
+ if (UDIV_NEEDS_NORMALIZATION)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, d);
+ if (normalization_steps != 0)
+ {
+ d <<= normalization_steps;
+ rlimb <<= normalization_steps;
+
+ for (i = qxn - 1; i >= 0; i--)
+ udiv_qrnnd (qp[i], rlimb, rlimb, 0, d);
+
+ return rlimb >> normalization_steps;
+ }
+ else
+ /* fall out */
+ ;
+ }
+
+ for (i = qxn - 1; i >= 0; i--)
+ udiv_qrnnd (qp[i], rlimb, rlimb, 0, d);
+
+ return rlimb;
+}
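
The preinversion referenced in the comments (invert_limb / udiv_qrnnd_preinv) trades each division for a multiplication plus a bounded correction. A 128-bit sketch of the idea, not GMP's actual macro: for a normalized d (high bit set), di = floor((B^2-1)/d) - B with B = 2^64, and the multiplication-derived estimate never exceeds the true quotient, so a short correction loop suffices (assumes unsigned __int128; requires nh < d):

#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

static uint64_t
div_preinv (uint64_t *r, uint64_t nh, uint64_t nl, uint64_t d, uint64_t di)
{
  uint64_t q = (uint64_t) (((u128) nh * di) >> 64) + nh;  /* estimate */
  u128 rem = (((u128) nh << 64) | nl) - (u128) q * d;     /* >= 0 */
  while (rem >= d)              /* at most a few iterations */
    q++, rem -= d;
  *r = (uint64_t) rem;
  return q;
}

int
main (void)
{
  uint64_t d = 0x8000000000000003ULL;          /* normalized divisor */
  uint64_t di = (uint64_t) (~(u128) 0 / d);    /* invert_limb, the slow way */
  uint64_t nh = 0x0123456789abcdefULL, nl = 42, r;
  uint64_t q = div_preinv (&r, nh, nl, d, di);
  u128 n = ((u128) nh << 64) | nl;
  printf ("ok=%d\n", (u128) q * d + r == n && r < d);     /* ok=1 */
  return 0;
}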
diff --git a/rts/gmp/mpn/generic/divrem_2.c b/rts/gmp/mpn/generic/divrem_2.c
new file mode 100644
index 0000000000..0bc31ae2e7
--- /dev/null
+++ b/rts/gmp/mpn/generic/divrem_2.c
@@ -0,0 +1,151 @@
+/* mpn_divrem_2 -- Divide natural numbers, producing both remainder and
+ quotient. The divisor is two limbs.
+
+ THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP
+ RELEASE.
+
+
+Copyright (C) 1993, 1994, 1995, 1996, 1999, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Divide num (NP/NSIZE) by den (DP/2) and write
+ the NSIZE-2 least significant quotient limbs at QP
+ and the 2 long remainder at NP. If QEXTRA_LIMBS is
+ non-zero, generate that many fraction bits and append them after the
+ other quotient limbs.
+   Return the most significant limb of the quotient; this is always 0 or 1.
+
+ Preconditions:
+ 0. NSIZE >= 2.
+ 1. The most significant bit of the divisor must be set.
+ 2. QP must either not overlap with the input operands at all, or
+ QP + 2 >= NP must hold true. (This means that it's
+ possible to put the quotient in the high part of NUM, right after the
+      remainder in NUM.)
+ 3. NSIZE >= 2, even if QEXTRA_LIMBS is non-zero. */
+
+mp_limb_t
+#if __STDC__
+mpn_divrem_2 (mp_ptr qp, mp_size_t qxn,
+ mp_ptr np, mp_size_t nsize,
+ mp_srcptr dp)
+#else
+mpn_divrem_2 (qp, qxn, np, nsize, dp)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_ptr np;
+ mp_size_t nsize;
+ mp_srcptr dp;
+#endif
+{
+ mp_limb_t most_significant_q_limb = 0;
+ mp_size_t i;
+ mp_limb_t n1, n0, n2;
+ mp_limb_t d1, d0;
+ mp_limb_t d1inv;
+ int have_preinv;
+
+ np += nsize - 2;
+ d1 = dp[1];
+ d0 = dp[0];
+ n1 = np[1];
+ n0 = np[0];
+
+ if (n1 >= d1 && (n1 > d1 || n0 >= d0))
+ {
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
+ most_significant_q_limb = 1;
+ }
+
+ /* If multiplication is much faster than division, preinvert the most
+ significant divisor limb before entering the loop. */
+ if (UDIV_TIME > 2 * UMUL_TIME + 6)
+ {
+ have_preinv = 0;
+ if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - 2) > UDIV_TIME)
+ {
+ invert_limb (d1inv, d1);
+ have_preinv = 1;
+ }
+ }
+
+ for (i = qxn + nsize - 2 - 1; i >= 0; i--)
+ {
+ mp_limb_t q;
+ mp_limb_t r;
+
+ if (i >= qxn)
+ np--;
+ else
+ np[0] = 0;
+
+ if (n1 == d1)
+ {
+ /* Q should be either 111..111 or 111..110. Need special treatment
+ of this rare case as normal division would give overflow. */
+ q = ~(mp_limb_t) 0;
+
+ r = n0 + d1;
+ if (r < d1) /* Carry in the addition? */
+ {
+ add_ssaaaa (n1, n0, r - d0, np[0], 0, d0);
+ qp[i] = q;
+ continue;
+ }
+ n1 = d0 - (d0 != 0);
+ n0 = -d0;
+ }
+ else
+ {
+ if (UDIV_TIME > 2 * UMUL_TIME + 6 && have_preinv)
+ udiv_qrnnd_preinv (q, r, n1, n0, d1, d1inv);
+ else
+ udiv_qrnnd (q, r, n1, n0, d1);
+ umul_ppmm (n1, n0, d0, q);
+ }
+
+ n2 = np[0];
+
+ q_test:
+ if (n1 > r || (n1 == r && n0 > n2))
+ {
+ /* The estimated Q was too large. */
+ q--;
+
+ sub_ddmmss (n1, n0, n1, n0, 0, d0);
+ r += d1;
+ if (r >= d1) /* If not carry, test Q again. */
+ goto q_test;
+ }
+
+ qp[i] = q;
+ sub_ddmmss (n1, n0, r, n2, n1, n0);
+ }
+ np[1] = n1;
+ np[0] = n0;
+
+ return most_significant_q_limb;
+}
diff --git a/rts/gmp/mpn/generic/dump.c b/rts/gmp/mpn/generic/dump.c
new file mode 100644
index 0000000000..66f375c74b
--- /dev/null
+++ b/rts/gmp/mpn/generic/dump.c
@@ -0,0 +1,76 @@
+/* THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO
+ CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS
+ FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpn_dump (mp_srcptr ptr, mp_size_t size)
+#else
+mpn_dump (ptr, size)
+ mp_srcptr ptr;
+ mp_size_t size;
+#endif
+{
+ MPN_NORMALIZE (ptr, size);
+
+ if (size == 0)
+ printf ("0\n");
+ else
+ {
+ size--;
+ if (BYTES_PER_MP_LIMB > sizeof (long))
+ {
+ if ((ptr[size] >> BITS_PER_MP_LIMB/2) != 0)
+ {
+ printf ("%lX",
+ (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2));
+ printf ("%0*lX", (int) (BYTES_PER_MP_LIMB),
+ (unsigned long) ptr[size]);
+ }
+ else
+ printf ("%lX", (unsigned long) ptr[size]);
+ }
+ else
+ printf ("%lX", ptr[size]);
+
+ while (size)
+ {
+ size--;
+ if (BYTES_PER_MP_LIMB > sizeof (long))
+ {
+ printf ("%0*lX", (int) (BYTES_PER_MP_LIMB),
+ (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2));
+ printf ("%0*lX", (int) (BYTES_PER_MP_LIMB),
+ (unsigned long) ptr[size]);
+ }
+ else
+ printf ("%0*lX", (int) (2 * BYTES_PER_MP_LIMB), ptr[size]);
+ }
+ printf ("\n");
+ }
+}
diff --git a/rts/gmp/mpn/generic/gcd.c b/rts/gmp/mpn/generic/gcd.c
new file mode 100644
index 0000000000..059e219a06
--- /dev/null
+++ b/rts/gmp/mpn/generic/gcd.c
@@ -0,0 +1,414 @@
+/* mpn/gcd.c: mpn_gcd for gcd of two odd integers.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* Integer greatest common divisor of two unsigned integers, using
+ the accelerated algorithm (see reference below).
+
+ mp_size_t mpn_gcd (up, usize, vp, vsize).
+
+ Preconditions [U = (up, usize) and V = (vp, vsize)]:
+
+ 1. V is odd.
+ 2. numbits(U) >= numbits(V).
+
+ Both U and V are destroyed by the operation. The result is left at vp,
+ and its size is returned.
+
+ Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
+
+ Funding for this work has been partially provided by Conselho Nacional
+ de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
+   301314194-2, and was done while I was a visiting researcher in the Instituto
+ de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
+
+ Refer to
+ K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
+ Mathematical Software, v. 21 (March), 1995, pp. 111-122. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated
+ algorithm is used, otherwise the binary algorithm is used. This may be
+ adjusted for different architectures. */
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 5
+#endif
+
+/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated
+ algorithm reduces using the bmod operation. Otherwise, the k-ary reduction
+ is used. 0 <= BMOD_THRESHOLD < BITS_PER_MP_LIMB. */
+enum
+ {
+ BMOD_THRESHOLD = BITS_PER_MP_LIMB/2
+ };
+
+
+/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2.
+ Both U and V must be odd. */
+static __gmp_inline mp_size_t
+#if __STDC__
+gcd_2 (mp_ptr vp, mp_srcptr up)
+#else
+gcd_2 (vp, up)
+ mp_ptr vp;
+ mp_srcptr up;
+#endif
+{
+ mp_limb_t u0, u1, v0, v1;
+ mp_size_t vsize;
+
+ u0 = up[0], u1 = up[1], v0 = vp[0], v1 = vp[1];
+
+ while (u1 != v1 && u0 != v0)
+ {
+ unsigned long int r;
+ if (u1 > v1)
+ {
+ u1 -= v1 + (u0 < v0), u0 -= v0;
+ count_trailing_zeros (r, u0);
+ u0 = u1 << (BITS_PER_MP_LIMB - r) | u0 >> r;
+ u1 >>= r;
+ }
+ else /* u1 < v1. */
+ {
+ v1 -= u1 + (v0 < u0), v0 -= u0;
+ count_trailing_zeros (r, v0);
+ v0 = v1 << (BITS_PER_MP_LIMB - r) | v0 >> r;
+ v1 >>= r;
+ }
+ }
+
+ vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0);
+
+ /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */
+ if (u1 == v1 && u0 == v0)
+ return vsize;
+
+ v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0;
+ vp[0] = mpn_gcd_1 (vp, vsize, v0);
+
+ return 1;
+}
+
+/* The function find_a finds 0 < N < 2^BITS_PER_MP_LIMB such that there exists
+ 0 < |D| < 2^BITS_PER_MP_LIMB, and N == D * C mod 2^(2*BITS_PER_MP_LIMB).
+ In the reference article, D was computed along with N, but it is better to
+ compute D separately as D <-- N / C mod 2^(BITS_PER_MP_LIMB + 1), treating
+ the result as a twos' complement signed integer.
+
+ Initialize N1 to C mod 2^(2*BITS_PER_MP_LIMB). According to the reference
+ article, N2 should be initialized to 2^(2*BITS_PER_MP_LIMB), but we use
+ 2^(2*BITS_PER_MP_LIMB) - N1 to start the calculations within double
+ precision. If N2 > N1 initially, the first iteration of the while loop
+ will swap them. In all other situations, N1 >= N2 is maintained. */
+
+static
+#if ! defined (__i386__)
+__gmp_inline /* don't inline this for the x86 */
+#endif
+mp_limb_t
+#if __STDC__
+find_a (mp_srcptr cp)
+#else
+find_a (cp)
+ mp_srcptr cp;
+#endif
+{
+ unsigned long int leading_zero_bits = 0;
+
+ mp_limb_t n1_l = cp[0]; /* N1 == n1_h * 2^BITS_PER_MP_LIMB + n1_l. */
+ mp_limb_t n1_h = cp[1];
+
+ mp_limb_t n2_l = -n1_l; /* N2 == n2_h * 2^BITS_PER_MP_LIMB + n2_l. */
+ mp_limb_t n2_h = ~n1_h;
+
+ /* Main loop. */
+ while (n2_h) /* While N2 >= 2^BITS_PER_MP_LIMB. */
+ {
+ /* N1 <-- N1 % N2. */
+ if ((MP_LIMB_T_HIGHBIT >> leading_zero_bits & n2_h) == 0)
+ {
+ unsigned long int i;
+ count_leading_zeros (i, n2_h);
+ i -= leading_zero_bits, leading_zero_bits += i;
+ n2_h = n2_h<<i | n2_l>>(BITS_PER_MP_LIMB - i), n2_l <<= i;
+ do
+ {
+ if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
+ n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
+ n2_l = n2_l>>1 | n2_h<<(BITS_PER_MP_LIMB - 1), n2_h >>= 1;
+ i -= 1;
+ }
+ while (i);
+ }
+ if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
+ n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
+
+ MP_LIMB_T_SWAP (n1_h, n2_h);
+ MP_LIMB_T_SWAP (n1_l, n2_l);
+ }
+
+ return n2_l;
+}
+
+mp_size_t
+#if __STDC__
+mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcd (gp, up, usize, vp, vsize)
+ mp_ptr gp;
+ mp_ptr up;
+ mp_size_t usize;
+ mp_ptr vp;
+ mp_size_t vsize;
+#endif
+{
+ mp_ptr orig_vp = vp;
+ mp_size_t orig_vsize = vsize;
+ int binary_gcd_ctr; /* Number of times binary gcd will execute. */
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD.
+     Two EXTRA limbs for U and V are required for k-ary reduction.  */
+ if (vsize >= GCD_ACCEL_THRESHOLD)
+ {
+ unsigned long int vbitsize, d;
+ mp_ptr orig_up = up;
+ mp_size_t orig_usize = usize;
+ mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB);
+
+ MPN_COPY (anchor_up, orig_up, usize);
+ up = anchor_up;
+
+ count_leading_zeros (d, up[usize-1]);
+ d = usize * BITS_PER_MP_LIMB - d;
+ count_leading_zeros (vbitsize, vp[vsize-1]);
+ vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+ d = d - vbitsize + 1;
+
+ /* Use bmod reduction to quickly discover whether V divides U. */
+ up[usize++] = 0; /* Insert leading zero. */
+ mpn_bdivmod (up, up, usize, vp, vsize, d);
+
+ /* Now skip U/V mod 2^d and any low zero limbs. */
+ d /= BITS_PER_MP_LIMB, up += d, usize -= d;
+ while (usize != 0 && up[0] == 0)
+ up++, usize--;
+
+ if (usize == 0) /* GCD == ORIG_V. */
+ goto done;
+
+ vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB);
+ MPN_COPY (vp, orig_vp, vsize);
+
+ do /* Main loop. */
+ {
+ /* mpn_com_n can't be used here because anchor_up and up may
+ partially overlap */
+ if (up[usize-1] & MP_LIMB_T_HIGHBIT) /* U < 0; take twos' compl. */
+ {
+ mp_size_t i;
+ anchor_up[0] = -up[0];
+ for (i = 1; i < usize; i++)
+ anchor_up[i] = ~up[i];
+ up = anchor_up;
+ }
+
+ MPN_NORMALIZE_NOT_ZERO (up, usize);
+
+ if ((up[0] & 1) == 0) /* Result even; remove twos. */
+ {
+ unsigned int r;
+ count_trailing_zeros (r, up[0]);
+ mpn_rshift (anchor_up, up, usize, r);
+ usize -= (anchor_up[usize-1] == 0);
+ }
+ else if (anchor_up != up)
+ MPN_COPY_INCR (anchor_up, up, usize);
+
+ MPN_PTR_SWAP (anchor_up,usize, vp,vsize);
+ up = anchor_up;
+
+ if (vsize <= 2) /* Kary can't handle < 2 limbs and */
+ break; /* isn't efficient for == 2 limbs. */
+
+ d = vbitsize;
+ count_leading_zeros (vbitsize, vp[vsize-1]);
+ vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+ d = d - vbitsize + 1;
+
+ if (d > BMOD_THRESHOLD) /* Bmod reduction. */
+ {
+ up[usize++] = 0;
+ mpn_bdivmod (up, up, usize, vp, vsize, d);
+ d /= BITS_PER_MP_LIMB, up += d, usize -= d;
+ }
+ else /* Kary reduction. */
+ {
+ mp_limb_t bp[2], cp[2];
+
+ /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB). */
+ {
+ mp_limb_t u_inv, hi, lo;
+ modlimb_invert (u_inv, up[0]);
+ cp[0] = vp[0] * u_inv;
+ umul_ppmm (hi, lo, cp[0], up[0]);
+ cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv;
+ }
+
+ /* U <-- find_a (C) * U. */
+ up[usize] = mpn_mul_1 (up, up, usize, find_a (cp));
+ usize++;
+
+ /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1).
+ bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and
+ bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2
+
+ Like V/U above, but simplified because only the low bit of
+ bp[1] is wanted. */
+ {
+ mp_limb_t v_inv, hi, lo;
+ modlimb_invert (v_inv, vp[0]);
+ bp[0] = up[0] * v_inv;
+ umul_ppmm (hi, lo, bp[0], vp[0]);
+ bp[1] = (up[1] + hi + (bp[0]&vp[1])) & 1;
+ }
+
+ up[usize++] = 0;
+ if (bp[1]) /* B < 0: U <-- U + (-B) * V. */
+ {
+ mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0]);
+ mpn_add_1 (up + vsize, up + vsize, usize - vsize, c);
+ }
+ else /* B >= 0: U <-- U - B * V. */
+ {
+ mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]);
+ mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
+ }
+
+ up += 2, usize -= 2; /* At least two low limbs are zero. */
+ }
+
+ /* Must remove low zero limbs before complementing. */
+ while (usize != 0 && up[0] == 0)
+ up++, usize--;
+ }
+ while (usize);
+
+ /* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. */
+ up = orig_up, usize = orig_usize;
+ binary_gcd_ctr = 2;
+ }
+ else
+ binary_gcd_ctr = 1;
+
+ /* Finish up with the binary algorithm. Executes once or twice. */
+ for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize)
+ {
+ if (usize > 2) /* First make U close to V in size. */
+ {
+ unsigned long int vbitsize, d;
+ count_leading_zeros (d, up[usize-1]);
+ d = usize * BITS_PER_MP_LIMB - d;
+ count_leading_zeros (vbitsize, vp[vsize-1]);
+ vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+ d = d - vbitsize - 1;
+ if (d != -(unsigned long int)1 && d > 2)
+ {
+ mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */
+ d /= (unsigned long int)BITS_PER_MP_LIMB, up += d, usize -= d;
+ }
+ }
+
+ /* Start binary GCD. */
+ do
+ {
+ mp_size_t zeros;
+
+ /* Make sure U is odd. */
+ MPN_NORMALIZE (up, usize);
+ while (up[0] == 0)
+ up += 1, usize -= 1;
+ if ((up[0] & 1) == 0)
+ {
+ unsigned int r;
+ count_trailing_zeros (r, up[0]);
+ mpn_rshift (up, up, usize, r);
+ usize -= (up[usize-1] == 0);
+ }
+
+ /* Keep usize >= vsize. */
+ if (usize < vsize)
+ MPN_PTR_SWAP (up, usize, vp, vsize);
+
+ if (usize <= 2) /* Double precision. */
+ {
+ if (vsize == 1)
+ vp[0] = mpn_gcd_1 (up, usize, vp[0]);
+ else
+ vsize = gcd_2 (vp, up);
+ break; /* Binary GCD done. */
+ }
+
+ /* Count number of low zero limbs of U - V. */
+ for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; )
+ continue;
+
+ /* If U < V, swap U and V; in any case, subtract V from U. */
+ if (zeros == vsize) /* Subtract done. */
+ up += zeros, usize -= zeros;
+ else if (usize == vsize)
+ {
+ mp_size_t size = vsize;
+ do
+ size--;
+ while (up[size] == vp[size]);
+ if (up[size] < vp[size]) /* usize == vsize. */
+ MP_PTR_SWAP (up, vp);
+ up += zeros, usize = size + 1 - zeros;
+ mpn_sub_n (up, up, vp + zeros, usize);
+ }
+ else
+ {
+ mp_size_t size = vsize - zeros;
+ up += zeros, usize -= zeros;
+ if (mpn_sub_n (up, up, vp + zeros, size))
+ {
+ while (up[size] == 0) /* Propagate borrow. */
+ up[size++] = -(mp_limb_t)1;
+ up[size] -= 1;
+ }
+ }
+ }
+ while (usize); /* End binary GCD. */
+ }
+
+done:
+ if (vp != gp)
+ MPN_COPY (gp, vp, vsize);
+ TMP_FREE (marker);
+ return vsize;
+}
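
The bmod reduction above preserves the GCD because V is odd: subtracting a multiple of V leaves gcd(U, V) unchanged, and the subsequent divisions by 2^BITS_PER_MP_LIMB only strip powers of two, which cannot divide V. A sketch checking one such step on a two-limb U (inv64 repeats the Newton inversion; unsigned __int128 assumed):

#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

static uint64_t
gcd64 (uint64_t a, uint64_t b)          /* reference Euclid */
{
  while (b != 0)
    {
      uint64_t t = a % b;
      a = b;
      b = t;
    }
  return a;
}

static uint64_t
inv64 (uint64_t v)                      /* 1/v mod 2^64, v odd */
{
  uint64_t x = v;                       /* v*v == 1 (mod 8) */
  int i;
  for (i = 0; i < 5; i++)
    x *= 2 - v * x;
  return x;
}

int
main (void)
{
  uint64_t v = 1000003;                 /* odd V */
  u128 u = ((u128) 0x0123456789abcdefULL << 64) | 0xfedcba9876543210ULL;
  uint64_t q = (uint64_t) u * inv64 (v);      /* u0 / v mod 2^64 */
  u128 u2 = (u - (u128) q * v) >> 64;         /* bmod step: low limb cancels */
  printf ("gcd before: %llu  after: %llu\n",  /* the two agree */
          (unsigned long long) gcd64 ((uint64_t) (u % v), v),
          (unsigned long long) gcd64 ((uint64_t) (u2 % v), v));
  return 0;
}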
diff --git a/rts/gmp/mpn/generic/gcd_1.c b/rts/gmp/mpn/generic/gcd_1.c
new file mode 100644
index 0000000000..1832636636
--- /dev/null
+++ b/rts/gmp/mpn/generic/gcd_1.c
@@ -0,0 +1,77 @@
+/* mpn_gcd_1 --
+
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Does not work for U == 0 or V == 0. It would be tough to make it work for
+ V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t. */
+
+mp_limb_t
+#if __STDC__
+mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb)
+#else
+mpn_gcd_1 (up, size, vlimb)
+ mp_srcptr up;
+ mp_size_t size;
+ mp_limb_t vlimb;
+#endif
+{
+ mp_limb_t ulimb;
+ unsigned long int u_low_zero_bits, v_low_zero_bits;
+
+ if (size > 1)
+ {
+ ulimb = mpn_mod_1 (up, size, vlimb);
+ if (ulimb == 0)
+ return vlimb;
+ }
+ else
+ ulimb = up[0];
+
+ /* Need to eliminate low zero bits. */
+ count_trailing_zeros (u_low_zero_bits, ulimb);
+ ulimb >>= u_low_zero_bits;
+
+ count_trailing_zeros (v_low_zero_bits, vlimb);
+ vlimb >>= v_low_zero_bits;
+
+ while (ulimb != vlimb)
+ {
+ if (ulimb > vlimb)
+ {
+ ulimb -= vlimb;
+ do
+ ulimb >>= 1;
+ while ((ulimb & 1) == 0);
+ }
+ else /* vlimb > ulimb. */
+ {
+ vlimb -= ulimb;
+ do
+ vlimb >>= 1;
+ while ((vlimb & 1) == 0);
+ }
+ }
+
+ return ulimb << MIN (u_low_zero_bits, v_low_zero_bits);
+}
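
The same subtract-and-shift loop, written out as a free-standing single-word binary GCD mirroring mpn_gcd_1's structure, with GCC/Clang's __builtin_ctzll standing in for count_trailing_zeros:

#include <stdint.h>
#include <stdio.h>

static uint64_t
binary_gcd (uint64_t u, uint64_t v)     /* requires u, v > 0 */
{
  int shift = __builtin_ctzll (u | v);  /* common low zero bits */
  u >>= __builtin_ctzll (u);            /* make both odd */
  v >>= __builtin_ctzll (v);
  while (u != v)
    {
      if (u > v)
        {
          u -= v;                       /* difference of odds is even */
          u >>= __builtin_ctzll (u);
        }
      else
        {
          v -= u;
          v >>= __builtin_ctzll (v);
        }
    }
  return u << shift;                    /* restore the common twos */
}

int
main (void)
{
  printf ("%llu\n", (unsigned long long) binary_gcd (48, 180));  /* 12 */
  return 0;
}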
diff --git a/rts/gmp/mpn/generic/gcdext.c b/rts/gmp/mpn/generic/gcdext.c
new file mode 100644
index 0000000000..fe22d779a6
--- /dev/null
+++ b/rts/gmp/mpn/generic/gcdext.c
@@ -0,0 +1,700 @@
+/* mpn_gcdext -- Extended Greatest Common Divisor.
+
+Copyright (C) 1996, 1998, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 17
+#endif
+
+#ifndef EXTEND
+#define EXTEND 1
+#endif
+
+#if STAT
+int arr[BITS_PER_MP_LIMB];
+#endif
+
+
+/* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE)
+
+ Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the
+ greatest common divisor at GP (unless it is 0), and the first cofactor at
+ SP. Write the size of the cofactor through the pointer SSIZE. Return the
+ size of the value at GP. Note that SP might be a negative number; this is
+ denoted by storing the negative of the size through SSIZE.
+
+ {UP,USIZE} and {VP,VSIZE} are both clobbered.
+
+ The space allocation for all four areas needs to be USIZE+1.
+
+ Preconditions: 1) U >= V.
+ 2) V > 0. */
+
+/* We use Lehmer's algorithm. The idea is to extract the most significant
+ bits of the operands, and compute the continued fraction for them. We then
+ apply the gathered cofactors to the full operands.
+
+ Idea 1: After we have performed a full division, don't shift operands back,
+ but instead account for the extra factors-of-2 thus introduced.
+ Idea 2: Simple generalization to use divide-and-conquer would give us an
+ algorithm that runs faster than O(n^2).
+ Idea 3: The input numbers need less space as the computation progresses,
+ while the s0 and s1 variables need more space. To save memory, we
+ could make them share space, and have the latter variables grow
+ into the former.
+ Idea 4: We should not do double-limb arithmetic from the start. Instead,
+ do things in single-limb arithmetic until the quotients differ,
+ and then switch to double-limb arithmetic. */
+
+
+/* Division optimized for small quotients. If the quotient is more than one limb,
+ store 1 in *qh and return 0. */
+static mp_limb_t
+#if __STDC__
+div2 (mp_limb_t *qh, mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0)
+#else
+div2 (qh, n1, n0, d1, d0)
+ mp_limb_t *qh;
+ mp_limb_t n1;
+ mp_limb_t n0;
+ mp_limb_t d1;
+ mp_limb_t d0;
+#endif
+{
+ if (d1 == 0)
+ {
+ *qh = 1;
+ return 0;
+ }
+
+ if ((mp_limb_signed_t) n1 < 0)
+ {
+ mp_limb_t q;
+ int cnt;
+ for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++)
+ {
+ d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1));
+ d0 = d0 << 1;
+ }
+
+ q = 0;
+ while (cnt)
+ {
+ q <<= 1;
+ if (n1 > d1 || (n1 == d1 && n0 >= d0))
+ {
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
+ q |= 1;
+ }
+ d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1);
+ d1 = d1 >> 1;
+ cnt--;
+ }
+
+ *qh = 0;
+ return q;
+ }
+ else
+ {
+ mp_limb_t q;
+ int cnt;
+ for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++)
+ {
+ d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1));
+ d0 = d0 << 1;
+ }
+
+ q = 0;
+ while (cnt)
+ {
+ d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1);
+ d1 = d1 >> 1;
+ q <<= 1;
+ if (n1 > d1 || (n1 == d1 && n0 >= d0))
+ {
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
+ q |= 1;
+ }
+ cnt--;
+ }
+
+ *qh = 0;
+ return q;
+ }
+}
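+
+/* Illustrative, with 8-bit limbs: dividing (n1,n0) = 0x1234 by (d1,d0) =
+   0x0567, the loop first shifts d left until it exceeds n (two steps, to
+   0x159c), then steps back one bit at a time, subtracting where possible;
+   the quotient bits come out as 1,1, giving q = 3 with 0x01ff left in
+   (n1,n0). */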
+
+mp_size_t
+#if EXTEND
+#if __STDC__
+mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
+ mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcdext (gp, s0p, s0size, up, size, vp, vsize)
+ mp_ptr gp;
+ mp_ptr s0p;
+ mp_size_t *s0size;
+ mp_ptr up;
+ mp_size_t size;
+ mp_ptr vp;
+ mp_size_t vsize;
+#endif
+#else
+#if __STDC__
+mpn_gcd (mp_ptr gp,
+ mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcd (gp, up, size, vp, vsize)
+ mp_ptr gp;
+ mp_ptr up;
+ mp_size_t size;
+ mp_ptr vp;
+ mp_size_t vsize;
+#endif
+#endif
+{
+ mp_limb_t A, B, C, D;
+ int cnt;
+ mp_ptr tp, wp;
+#if RECORD
+ mp_limb_t max = 0;
+#endif
+#if EXTEND
+ mp_ptr s1p;
+ mp_ptr orig_s0p = s0p;
+ mp_size_t ssize;
+ int sign = 1;
+#endif
+ int use_double_flag;
+ TMP_DECL (mark);
+
+ TMP_MARK (mark);
+
+ use_double_flag = (size >= GCDEXT_THRESHOLD);
+
+ tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+ wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+#if EXTEND
+ s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+
+ MPN_ZERO (s0p, size);
+ MPN_ZERO (s1p, size);
+
+ s0p[0] = 1;
+ s1p[0] = 0;
+ ssize = 1;
+#endif
+
+ if (size > vsize)
+ {
+ /* Normalize V (and shift up U the same amount). */
+ count_leading_zeros (cnt, vp[vsize - 1]);
+ if (cnt != 0)
+ {
+ mp_limb_t cy;
+ mpn_lshift (vp, vp, vsize, cnt);
+ cy = mpn_lshift (up, up, size, cnt);
+ up[size] = cy;
+ size += cy != 0;
+ }
+
+ mpn_divmod (up + vsize, up, size, vp, vsize);
+#if EXTEND
+ /* This is really what it boils down to in this case... */
+ s0p[0] = 0;
+ s1p[0] = 1;
+ sign = -sign;
+#endif
+ size = vsize;
+ if (cnt != 0)
+ {
+ mpn_rshift (up, up, size, cnt);
+ mpn_rshift (vp, vp, size, cnt);
+ }
+ MP_PTR_SWAP (up, vp);
+ }
+
+ for (;;)
+ {
+ mp_limb_t asign;
+ /* Figure out exact size of V. */
+ vsize = size;
+ MPN_NORMALIZE (vp, vsize);
+ if (vsize <= 1)
+ break;
+
+ if (use_double_flag)
+ {
+ mp_limb_t uh, vh, ul, vl;
+ /* Let UH,UL be the most significant limbs of U, and let VH,VL be
+ the corresponding bits from V. */
+ uh = up[size - 1];
+ vh = vp[size - 1];
+ ul = up[size - 2];
+ vl = vp[size - 2];
+ count_leading_zeros (cnt, uh);
+ if (cnt != 0)
+ {
+ uh = (uh << cnt) | (ul >> (BITS_PER_MP_LIMB - cnt));
+ vh = (vh << cnt) | (vl >> (BITS_PER_MP_LIMB - cnt));
+ vl <<= cnt;
+ ul <<= cnt;
+ if (size >= 3)
+ {
+ ul |= (up[size - 3] >> (BITS_PER_MP_LIMB - cnt));
+ vl |= (vp[size - 3] >> (BITS_PER_MP_LIMB - cnt));
+ }
+ }
+
+ A = 1;
+ B = 0;
+ C = 0;
+ D = 1;
+
+ asign = 0;
+ for (;;)
+ {
+ mp_limb_t T;
+ mp_limb_t qh, q1, q2;
+ mp_limb_t nh, nl, dh, dl;
+ mp_limb_t t1, t0;
+ mp_limb_t Th, Tl;
+
+ sub_ddmmss (dh, dl, vh, vl, 0, C);
+ if ((dl | dh) == 0)
+ break;
+ add_ssaaaa (nh, nl, uh, ul, 0, A);
+ q1 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ add_ssaaaa (dh, dl, vh, vl, 0, D);
+ if ((dl | dh) == 0)
+ break;
+ sub_ddmmss (nh, nl, uh, ul, 0, B);
+ q2 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ if (q1 != q2)
+ break;
+
+ asign = ~asign;
+
+ T = A + q1 * C;
+ A = C;
+ C = T;
+ T = B + q1 * D;
+ B = D;
+ D = T;
+ umul_ppmm (t1, t0, q1, vl);
+ t1 += q1 * vh;
+ sub_ddmmss (Th, Tl, uh, ul, t1, t0);
+ uh = vh, ul = vl;
+ vh = Th, vl = Tl;
+
+ add_ssaaaa (dh, dl, vh, vl, 0, C);
+ sub_ddmmss (nh, nl, uh, ul, 0, A);
+ q1 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ sub_ddmmss (dh, dl, vh, vl, 0, D);
+ if ((dl | dh) == 0)
+ break;
+ add_ssaaaa (nh, nl, uh, ul, 0, B);
+ q2 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ if (q1 != q2)
+ break;
+
+ asign = ~asign;
+
+ T = A + q1 * C;
+ A = C;
+ C = T;
+ T = B + q1 * D;
+ B = D;
+ D = T;
+ umul_ppmm (t1, t0, q1, vl);
+ t1 += q1 * vh;
+ sub_ddmmss (Th, Tl, uh, ul, t1, t0);
+ uh = vh, ul = vl;
+ vh = Th, vl = Tl;
+ }
+#if EXTEND
+ if (asign)
+ sign = -sign;
+#endif
+ }
+ else /* Same, but using single-limb calculations. */
+ {
+ mp_limb_t uh, vh;
+ /* Make UH be the most significant limb of U, and make VH be
+ corresponding bits from V. */
+ uh = up[size - 1];
+ vh = vp[size - 1];
+ count_leading_zeros (cnt, uh);
+ if (cnt != 0)
+ {
+ uh = (uh << cnt) | (up[size - 2] >> (BITS_PER_MP_LIMB - cnt));
+ vh = (vh << cnt) | (vp[size - 2] >> (BITS_PER_MP_LIMB - cnt));
+ }
+
+ A = 1;
+ B = 0;
+ C = 0;
+ D = 1;
+
+ asign = 0;
+ for (;;)
+ {
+ mp_limb_t q, T;
+ if (vh - C == 0 || vh + D == 0)
+ break;
+
+ q = (uh + A) / (vh - C);
+ if (q != (uh - B) / (vh + D))
+ break;
+
+ asign = ~asign;
+
+ T = A + q * C;
+ A = C;
+ C = T;
+ T = B + q * D;
+ B = D;
+ D = T;
+ T = uh - q * vh;
+ uh = vh;
+ vh = T;
+
+ if (vh - D == 0)
+ break;
+
+ q = (uh - A) / (vh + C);
+ if (q != (uh + B) / (vh - D))
+ break;
+
+ asign = ~asign;
+
+ T = A + q * C;
+ A = C;
+ C = T;
+ T = B + q * D;
+ B = D;
+ D = T;
+ T = uh - q * vh;
+ uh = vh;
+ vh = T;
+ }
+#if EXTEND
+ if (asign)
+ sign = -sign;
+#endif
+ }
+
+#if RECORD
+ max = MAX (A, max); max = MAX (B, max);
+ max = MAX (C, max); max = MAX (D, max);
+#endif
+
+ if (B == 0)
+ {
+ mp_limb_t qh;
+ mp_size_t i;
+	  /* This case is quite rare, so optimization effort is better spent
+	     elsewhere. */
+
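+	  /* The quotient q = {up+vsize, qsize} from the division below is
+	     applied to the cofactors as well: s1_new = s0 + q*s1, computed
+	     limb by limb with mpn_addmul_1, and the sign flips. */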
+ /* Normalize V (and shift up U the same amount). */
+ count_leading_zeros (cnt, vp[vsize - 1]);
+ if (cnt != 0)
+ {
+ mp_limb_t cy;
+ mpn_lshift (vp, vp, vsize, cnt);
+ cy = mpn_lshift (up, up, size, cnt);
+ up[size] = cy;
+ size += cy != 0;
+ }
+
+ qh = mpn_divmod (up + vsize, up, size, vp, vsize);
+#if EXTEND
+ MPN_COPY (tp, s0p, ssize);
+ {
+ mp_size_t qsize;
+
+ qsize = size - vsize; /* size of stored quotient from division */
+ if (ssize < qsize)
+ {
+ MPN_ZERO (tp + ssize, qsize - ssize);
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < ssize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, up + vsize, qsize, s1p[i]);
+ tp[qsize + i] = cy;
+ }
+ if (qh != 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize);
+ if (cy != 0)
+ abort ();
+ }
+ }
+ else
+ {
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < qsize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, s1p, ssize, up[vsize + i]);
+ tp[ssize + i] = cy;
+ }
+ if (qh != 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize);
+ if (cy != 0)
+ {
+ tp[qsize + ssize] = cy;
+ s1p[qsize + ssize] = 0;
+ ssize++;
+ }
+ }
+ }
+ ssize += qsize;
+ ssize -= tp[ssize - 1] == 0;
+ }
+
+ sign = -sign;
+ MP_PTR_SWAP (s0p, s1p);
+ MP_PTR_SWAP (s1p, tp);
+#endif
+ size = vsize;
+ if (cnt != 0)
+ {
+ mpn_rshift (up, up, size, cnt);
+ mpn_rshift (vp, vp, size, cnt);
+ }
+ MP_PTR_SWAP (up, vp);
+ }
+ else
+ {
+#if EXTEND
+ mp_size_t tsize, wsize;
+#endif
+ /* T = U*A + V*B
+ W = U*C + V*D
+ U = T
+ V = W */
+
+#if STAT
+ { mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x);
+ arr[BITS_PER_MP_LIMB - cnt]++; }
+#endif
+ if (A == 0)
+ {
+ /* B == 1 and C == 1 (D is arbitrary) */
+ mp_limb_t cy;
+ MPN_COPY (tp, vp, size);
+ MPN_COPY (wp, up, size);
+ mpn_submul_1 (wp, vp, size, D);
+ MP_PTR_SWAP (tp, up);
+ MP_PTR_SWAP (wp, vp);
+#if EXTEND
+ MPN_COPY (tp, s1p, ssize);
+ tsize = ssize;
+ tp[ssize] = 0; /* must zero since wp might spill below */
+ MPN_COPY (wp, s0p, ssize);
+ cy = mpn_addmul_1 (wp, s1p, ssize, D);
+ wp[ssize] = cy;
+ wsize = ssize + (cy != 0);
+ MP_PTR_SWAP (tp, s0p);
+ MP_PTR_SWAP (wp, s1p);
+ ssize = MAX (wsize, tsize);
+#endif
+ }
+ else
+ {
+ if (asign)
+ {
+ mp_limb_t cy;
+ mpn_mul_1 (tp, vp, size, B);
+ mpn_submul_1 (tp, up, size, A);
+ mpn_mul_1 (wp, up, size, C);
+ mpn_submul_1 (wp, vp, size, D);
+ MP_PTR_SWAP (tp, up);
+ MP_PTR_SWAP (wp, vp);
+#if EXTEND
+ cy = mpn_mul_1 (tp, s1p, ssize, B);
+ cy += mpn_addmul_1 (tp, s0p, ssize, A);
+ tp[ssize] = cy;
+ tsize = ssize + (cy != 0);
+ cy = mpn_mul_1 (wp, s0p, ssize, C);
+ cy += mpn_addmul_1 (wp, s1p, ssize, D);
+ wp[ssize] = cy;
+ wsize = ssize + (cy != 0);
+ MP_PTR_SWAP (tp, s0p);
+ MP_PTR_SWAP (wp, s1p);
+ ssize = MAX (wsize, tsize);
+#endif
+ }
+ else
+ {
+ mp_limb_t cy;
+ mpn_mul_1 (tp, up, size, A);
+ mpn_submul_1 (tp, vp, size, B);
+ mpn_mul_1 (wp, vp, size, D);
+ mpn_submul_1 (wp, up, size, C);
+ MP_PTR_SWAP (tp, up);
+ MP_PTR_SWAP (wp, vp);
+#if EXTEND
+ cy = mpn_mul_1 (tp, s0p, ssize, A);
+ cy += mpn_addmul_1 (tp, s1p, ssize, B);
+ tp[ssize] = cy;
+ tsize = ssize + (cy != 0);
+ cy = mpn_mul_1 (wp, s1p, ssize, D);
+ cy += mpn_addmul_1 (wp, s0p, ssize, C);
+ wp[ssize] = cy;
+ wsize = ssize + (cy != 0);
+ MP_PTR_SWAP (tp, s0p);
+ MP_PTR_SWAP (wp, s1p);
+ ssize = MAX (wsize, tsize);
+#endif
+ }
+ }
+
+ size -= up[size - 1] == 0;
+ }
+ }
+
+#if RECORD
+ printf ("max: %lx\n", max);
+#endif
+
+#if STAT
+ {int i; for (i = 0; i < BITS_PER_MP_LIMB; i++) printf ("%d:%d\n", i, arr[i]);}
+#endif
+
+ if (vsize == 0)
+ {
+ if (gp != up && gp != 0)
+ MPN_COPY (gp, up, size);
+#if EXTEND
+ MPN_NORMALIZE (s0p, ssize);
+ if (orig_s0p != s0p)
+ MPN_COPY (orig_s0p, s0p, ssize);
+ *s0size = sign >= 0 ? ssize : -ssize;
+#endif
+ TMP_FREE (mark);
+ return size;
+ }
+ else
+ {
+ mp_limb_t vl, ul, t;
+#if EXTEND
+ mp_size_t qsize, i;
+#endif
+ vl = vp[0];
+#if EXTEND
+ t = mpn_divmod_1 (wp, up, size, vl);
+
+ MPN_COPY (tp, s0p, ssize);
+
+ qsize = size - (wp[size - 1] == 0); /* size of quotient from division */
+ if (ssize < qsize)
+ {
+ MPN_ZERO (tp + ssize, qsize - ssize);
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < ssize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]);
+ tp[qsize + i] = cy;
+ }
+ }
+ else
+ {
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < qsize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]);
+ tp[ssize + i] = cy;
+ }
+ }
+ ssize += qsize;
+ ssize -= tp[ssize - 1] == 0;
+
+ sign = -sign;
+ MP_PTR_SWAP (s0p, s1p);
+ MP_PTR_SWAP (s1p, tp);
+#else
+ t = mpn_mod_1 (up, size, vl);
+#endif
+ ul = vl;
+ vl = t;
+ while (vl != 0)
+ {
+ mp_limb_t t;
+#if EXTEND
+ mp_limb_t q;
+ q = ul / vl;
+ t = ul - q * vl;
+
+ MPN_COPY (tp, s0p, ssize);
+
+ MPN_ZERO (s1p + ssize, 1); /* zero s1 too */
+
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp, s1p, ssize, q);
+ tp[ssize] = cy;
+ }
+
+ ssize += 1;
+ ssize -= tp[ssize - 1] == 0;
+
+ sign = -sign;
+ MP_PTR_SWAP (s0p, s1p);
+ MP_PTR_SWAP (s1p, tp);
+#else
+ t = ul % vl;
+#endif
+ ul = vl;
+ vl = t;
+ }
+ if (gp != 0)
+ gp[0] = ul;
+#if EXTEND
+ MPN_NORMALIZE (s0p, ssize);
+ if (orig_s0p != s0p)
+ MPN_COPY (orig_s0p, s0p, ssize);
+ *s0size = sign >= 0 ? ssize : -ssize;
+#endif
+ TMP_FREE (mark);
+ return 1;
+ }
+}
diff --git a/rts/gmp/mpn/generic/get_str.c b/rts/gmp/mpn/generic/get_str.c
new file mode 100644
index 0000000000..a713b61825
--- /dev/null
+++ b/rts/gmp/mpn/generic/get_str.c
@@ -0,0 +1,216 @@
+/* mpn_get_str -- Convert an MSIZE-limb vector pointed to by MPTR
+   to a printable string in STR in base BASE.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Convert the MSIZE-limb vector pointed to by MPTR to a digit array,
+   using base BASE for the result.  Store the result in the character
+   array STR.  STR must point to an array with space for the largest
+   possible number an MSIZE-limb vector can represent, plus one extra
+   character.
+
+   The result is NOT in ASCII; to convert it to printable format, add
+   '0' or 'A' depending on the base and digit range.
+
+ Return the number of digits in the result string.
+ This may include some leading zeros.
+
+ The limb vector pointed to by MPTR is clobbered. */
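+
+/* Worked example (illustrative): one 32-bit limb with value 255 in base 10
+   gives out_len = 10.  The digits 2,5,5 are produced least significant
+   first into the far end of STR, the seven leading positions are filled
+   with zero digits, and 10 is returned.  The caller then maps each digit
+   to a character, e.g. by adding '0'. */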
+
+size_t
+#if __STDC__
+mpn_get_str (unsigned char *str, int base, mp_ptr mptr, mp_size_t msize)
+#else
+mpn_get_str (str, base, mptr, msize)
+ unsigned char *str;
+ int base;
+ mp_ptr mptr;
+ mp_size_t msize;
+#endif
+{
+ mp_limb_t big_base;
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+ int normalization_steps;
+#endif
+#if UDIV_TIME > 2 * UMUL_TIME
+ mp_limb_t big_base_inverted;
+#endif
+ unsigned int dig_per_u;
+ mp_size_t out_len;
+ register unsigned char *s;
+
+ big_base = __mp_bases[base].big_base;
+
+ s = str;
+
+ /* Special case zero, as the code below doesn't handle it. */
+ if (msize == 0)
+ {
+ s[0] = 0;
+ return 1;
+ }
+
+ if ((base & (base - 1)) == 0)
+ {
+ /* The base is a power of 2. Make conversion from most
+ significant side. */
+ mp_limb_t n1, n0;
+ register int bits_per_digit = big_base;
+ register int x;
+ register int bit_pos;
+ register int i;
+
+ n1 = mptr[msize - 1];
+ count_leading_zeros (x, n1);
+
+      /* BIT_POS should be R when the input ends in the least significant
+         digit, and R + bits_per_digit * n when it ends in the n:th least
+         significant digit. */
+
+ {
+ int bits;
+
+ bits = BITS_PER_MP_LIMB * msize - x;
+ x = bits % bits_per_digit;
+ if (x != 0)
+ bits += bits_per_digit - x;
+ bit_pos = bits - (msize - 1) * BITS_PER_MP_LIMB;
+ }
+
+ /* Fast loop for bit output. */
+ i = msize - 1;
+ for (;;)
+ {
+ bit_pos -= bits_per_digit;
+ while (bit_pos >= 0)
+ {
+ *s++ = (n1 >> bit_pos) & ((1 << bits_per_digit) - 1);
+ bit_pos -= bits_per_digit;
+ }
+ i--;
+ if (i < 0)
+ break;
+ n0 = (n1 << -bit_pos) & ((1 << bits_per_digit) - 1);
+ n1 = mptr[i];
+ bit_pos += BITS_PER_MP_LIMB;
+ *s++ = n0 | (n1 >> bit_pos);
+ }
+
+ *s = 0;
+
+ return s - str;
+ }
+ else
+ {
+ /* General case. The base is not a power of 2. Make conversion
+ from least significant end. */
+
+ /* If udiv_qrnnd only handles divisors with the most significant bit
+ set, prepare BIG_BASE for being a divisor by shifting it to the
+ left exactly enough to set the most significant bit. */
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+ count_leading_zeros (normalization_steps, big_base);
+ big_base <<= normalization_steps;
+#if UDIV_TIME > 2 * UMUL_TIME
+ /* Get the fixed-point approximation to 1/(BIG_BASE << NORMALIZATION_STEPS). */
+ big_base_inverted = __mp_bases[base].big_base_inverted;
+#endif
+#endif
+
+ dig_per_u = __mp_bases[base].chars_per_limb;
+ out_len = ((size_t) msize * BITS_PER_MP_LIMB
+ * __mp_bases[base].chars_per_bit_exactly) + 1;
+ s += out_len;
+
+ while (msize != 0)
+ {
+ int i;
+ mp_limb_t n0, n1;
+
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+ /* If we shifted BIG_BASE above, shift the dividend too, to get
+ the right quotient. We need to do this every loop,
+ since the intermediate quotients are OK, but the quotient from
+ one turn in the loop is going to be the dividend in the
+ next turn, and the dividend needs to be up-shifted. */
+ if (normalization_steps != 0)
+ {
+ n0 = mpn_lshift (mptr, mptr, msize, normalization_steps);
+
+ /* If the shifting gave a carry out limb, store it and
+ increase the length. */
+ if (n0 != 0)
+ {
+ mptr[msize] = n0;
+ msize++;
+ }
+ }
+#endif
+
+      /* Divide the number at MPTR by BIG_BASE to get a quotient and a
+         remainder.  The remainder is our new digit in base BIG_BASE. */
+ i = msize - 1;
+ n1 = mptr[i];
+
+ if (n1 >= big_base)
+ n1 = 0;
+ else
+ {
+ msize--;
+ i--;
+ }
+
+ for (; i >= 0; i--)
+ {
+ n0 = mptr[i];
+#if UDIV_TIME > 2 * UMUL_TIME
+ udiv_qrnnd_preinv (mptr[i], n1, n1, n0, big_base, big_base_inverted);
+#else
+ udiv_qrnnd (mptr[i], n1, n1, n0, big_base);
+#endif
+ }
+
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+ /* If we shifted above (at previous UDIV_NEEDS_NORMALIZATION tests)
+ the remainder will be up-shifted here. Compensate. */
+ n1 >>= normalization_steps;
+#endif
+
+ /* Convert N1 from BIG_BASE to a string of digits in BASE
+ using single precision operations. */
+ for (i = dig_per_u - 1; i >= 0; i--)
+ {
+ *--s = n1 % base;
+ n1 /= base;
+ if (n1 == 0 && msize == 0)
+ break;
+ }
+ }
+
+ while (s != str)
+ *--s = 0;
+ return out_len;
+ }
+}
diff --git a/rts/gmp/mpn/generic/gmp-mparam.h b/rts/gmp/mpn/generic/gmp-mparam.h
new file mode 100644
index 0000000000..14bcaece83
--- /dev/null
+++ b/rts/gmp/mpn/generic/gmp-mparam.h
@@ -0,0 +1,27 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
diff --git a/rts/gmp/mpn/generic/hamdist.c b/rts/gmp/mpn/generic/hamdist.c
new file mode 100644
index 0000000000..35c10e8450
--- /dev/null
+++ b/rts/gmp/mpn/generic/hamdist.c
@@ -0,0 +1,94 @@
+/* mpn_hamdist -- Compute the Hamming distance between two limb vectors.
+
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#if defined __GNUC__
+/* No processor claiming to be SPARC v9 compliant seems to
+   implement the POPC instruction.  Disable pattern for now. */
+#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64
+#define popc_limb(a) \
+ ({ \
+ DItype __res; \
+ asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \
+ __res; \
+ })
+#endif
+#endif
+
+#ifndef popc_limb
+
+/* Population count of an mp_limb_t: a parallel (SWAR) bit count, summing
+   adjacent bit fields of doubling width until a single total remains. */
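+
+/* Illustrative 8-bit trace for x = 0xda (binary 11011010, population 5):
+   x -= (x & 0xaa) >> 1               -> 10 01 01 01  (2,1,1,1 per pair)
+   x = ((x >> 2) & 0x33) + (x & 0x33) -> 0011 0010    (3,2 per nibble)
+   x = ((x >> 4) + x) & 0x0f          -> 5 */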
+
+static inline unsigned int
+#if __STDC__
+popc_limb (mp_limb_t x)
+#else
+popc_limb (x)
+ mp_limb_t x;
+#endif
+{
+#if BITS_PER_MP_LIMB == 64
+ /* We have to go into some trouble to define these constants.
+ (For mp_limb_t being `long long'.) */
+ mp_limb_t cnst;
+ cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2);
+ x -= (x & cnst) >> 1;
+ cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2);
+ x = ((x & ~cnst) >> 2) + (x & cnst);
+ cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2);
+ x = ((x >> 4) + x) & cnst;
+ x = ((x >> 8) + x);
+ x = ((x >> 16) + x);
+ x = ((x >> 32) + x) & 0xff;
+#endif
+#if BITS_PER_MP_LIMB == 32
+ x -= (x & 0xaaaaaaaa) >> 1;
+ x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L);
+ x = ((x >> 4) + x) & 0x0f0f0f0fL;
+ x = ((x >> 8) + x);
+ x = ((x >> 16) + x) & 0xff;
+#endif
+ return x;
+}
+#endif
+
+unsigned long int
+#if __STDC__
+mpn_hamdist (mp_srcptr up, mp_srcptr vp, mp_size_t size)
+#else
+mpn_hamdist (up, vp, size)
+ register mp_srcptr up;
+ register mp_srcptr vp;
+ register mp_size_t size;
+#endif
+{
+ unsigned long int hamdist;
+ mp_size_t i;
+
+ hamdist = 0;
+ for (i = 0; i < size; i++)
+ hamdist += popc_limb (up[i] ^ vp[i]);
+
+ return hamdist;
+}
diff --git a/rts/gmp/mpn/generic/inlines.c b/rts/gmp/mpn/generic/inlines.c
new file mode 100644
index 0000000000..9487e58cf2
--- /dev/null
+++ b/rts/gmp/mpn/generic/inlines.c
@@ -0,0 +1,24 @@
+/*
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#define _FORCE_INLINES
+#define _EXTERN_INLINE /* empty */
+#include "gmp.h"
diff --git a/rts/gmp/mpn/generic/jacbase.c b/rts/gmp/mpn/generic/jacbase.c
new file mode 100644
index 0000000000..dd437f1ac1
--- /dev/null
+++ b/rts/gmp/mpn/generic/jacbase.c
@@ -0,0 +1,136 @@
+/* mpn_jacobi_base -- limb/limb Jacobi symbol with restricted arguments.
+
+ THIS INTERFACE IS PRELIMINARY AND MIGHT DISAPPEAR OR BE SUBJECT TO
+ INCOMPATIBLE CHANGES IN A FUTURE RELEASE OF GMP. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+#if COUNT_TRAILING_ZEROS_TIME <= 7
+/* If count_trailing_zeros is fast, use it.
+ K7 at 7 cycles and P6 at 2 are good here. K6 at 12-27 and P5 at 18-42
+   are not.  The default of 15 in longlong.h is intended to mean not good
+   here. */
+
+#define PROCESS_TWOS_ANY \
+ { \
+ mp_limb_t twos; \
+ count_trailing_zeros (twos, a); \
+ result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b); \
+ a >>= twos; \
+ }
+
+#define PROCESS_TWOS_EVEN PROCESS_TWOS_ANY
+
+#else
+/* Use a loop instead. With "a" uniformly distributed there will usually be
+ only a few trailing zeros.
+
+ Unfortunately the branch for the while loop here will be on a 50/50
+ chance of a 1 or 0, which is bad for branch prediction. */
+
+#define PROCESS_TWOS_EVEN \
+ { \
+ int two; \
+ two = JACOBI_TWO_U_BIT1 (b); \
+ do \
+ { \
+ a >>= 1; \
+ result_bit1 ^= two; \
+ ASSERT (a != 0); \
+ } \
+ while ((a & 1) == 0); \
+ }
+
+#define PROCESS_TWOS_ANY \
+ if ((a & 1) == 0) \
+ PROCESS_TWOS_EVEN;
+
+#endif
+
+
+/* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but
+ with a restricted range of inputs accepted, namely b>1, b odd, and a<=b.
+
+ The initial result_bit1 is taken as a parameter for the convenience of
+ mpz_kronecker_zi_ui() et al. The sign changes both here and in those
+ routines accumulate nicely in bit 1, see the JACOBI macros.
+
+ The return value here is the normal +1, 0, or -1. Note that +1 and -1
+ have bit 1 in the "BIT1" sense, which could be useful if the caller is
+ accumulating it into some extended calculation.
+
+ Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be
+ possible, but a couple of tests suggest it's not a significant speedup,
+ and may even be a slowdown, so what's here is good enough for now.
+
+ Future: The code doesn't demand a<=b actually, so maybe this could be
+ relaxed. All the places this is used currently call with a<=b though. */
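+
+/* Worked example (illustrative): (3/7).  Both are odd and 3 <= 7;
+   reciprocity flips the sign since 3 == 7 == 3 (mod 4), giving -(7/3);
+   reducing 7 mod 3 leaves -(1/3) = -1, which matches Euler's criterion:
+   3^((7-1)/2) = 27 == -1 (mod 7). */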
+
+int
+#if __STDC__
+mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1)
+#else
+mpn_jacobi_base (a, b, result_bit1)
+ mp_limb_t a;
+ mp_limb_t b;
+ int result_bit1;
+#endif
+{
+ ASSERT (b & 1); /* b odd */
+ ASSERT (b != 1);
+ ASSERT (a <= b);
+
+ if (a == 0)
+ return 0;
+
+ PROCESS_TWOS_ANY;
+ if (a == 1)
+ goto done;
+
+ for (;;)
+ {
+ result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b);
+ MP_LIMB_T_SWAP (a, b);
+
+ do
+ {
+ /* working on (a/b), a,b odd, a>=b */
+ ASSERT (a & 1);
+ ASSERT (b & 1);
+ ASSERT (a >= b);
+
+ if ((a -= b) == 0)
+ return 0;
+
+ PROCESS_TWOS_EVEN;
+ if (a == 1)
+ goto done;
+ }
+ while (a >= b);
+ }
+
+ done:
+ return JACOBI_BIT1_TO_PN (result_bit1);
+}
diff --git a/rts/gmp/mpn/generic/lshift.c b/rts/gmp/mpn/generic/lshift.c
new file mode 100644
index 0000000000..0b58389658
--- /dev/null
+++ b/rts/gmp/mpn/generic/lshift.c
@@ -0,0 +1,87 @@
+/* mpn_lshift -- Shift left low level.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Shift U (pointed to by UP and USIZE digits long) CNT bits to the left
+ and store the USIZE least significant digits of the result at WP.
+ Return the bits shifted out from the most significant digit.
+
+ Argument constraints:
+ 1. 0 < CNT < BITS_PER_MP_LIMB
+ 2. If the result is to be written over the input, WP must be >= UP.
+*/
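+
+/* Example (illustrative): mpn_lshift (wp, up, 1, 4) with up[0] = 0x80000001
+   on a 32-bit limb stores 0x00000010 at wp[0] and returns 0x00000008, the
+   four bits shifted out at the top. */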
+
+mp_limb_t
+#if __STDC__
+mpn_lshift (register mp_ptr wp,
+ register mp_srcptr up, mp_size_t usize,
+ register unsigned int cnt)
+#else
+mpn_lshift (wp, up, usize, cnt)
+ register mp_ptr wp;
+ register mp_srcptr up;
+ mp_size_t usize;
+ register unsigned int cnt;
+#endif
+{
+ register mp_limb_t high_limb, low_limb;
+ register unsigned sh_1, sh_2;
+ register mp_size_t i;
+ mp_limb_t retval;
+
+#ifdef DEBUG
+ if (usize == 0 || cnt == 0)
+ abort ();
+#endif
+
+ sh_1 = cnt;
+#if 0
+ if (sh_1 == 0)
+ {
+ if (wp != up)
+ {
+ /* Copy from high end to low end, to allow specified input/output
+ overlapping. */
+ for (i = usize - 1; i >= 0; i--)
+ wp[i] = up[i];
+ }
+ return 0;
+ }
+#endif
+
+ wp += 1;
+ sh_2 = BITS_PER_MP_LIMB - sh_1;
+ i = usize - 1;
+ low_limb = up[i];
+ retval = low_limb >> sh_2;
+ high_limb = low_limb;
+ while (--i >= 0)
+ {
+ low_limb = up[i];
+ wp[i] = (high_limb << sh_1) | (low_limb >> sh_2);
+ high_limb = low_limb;
+ }
+ wp[i] = high_limb << sh_1;
+
+ return retval;
+}
diff --git a/rts/gmp/mpn/generic/mod_1.c b/rts/gmp/mpn/generic/mod_1.c
new file mode 100644
index 0000000000..168ec9df49
--- /dev/null
+++ b/rts/gmp/mpn/generic/mod_1.c
@@ -0,0 +1,175 @@
+/* mpn_mod_1(dividend_ptr, dividend_size, divisor_limb) --
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ Return the single-limb remainder.
+ There are no constraints on the value of the divisor.
+
+Copyright (C) 1991, 1993, 1994, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+mp_limb_t
+#if __STDC__
+mpn_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size,
+ mp_limb_t divisor_limb)
+#else
+mpn_mod_1 (dividend_ptr, dividend_size, divisor_limb)
+ mp_srcptr dividend_ptr;
+ mp_size_t dividend_size;
+ mp_limb_t divisor_limb;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t n1, n0, r;
+ int dummy;
+
+ /* Botch: Should this be handled at all? Rely on callers? */
+ if (dividend_size == 0)
+ return 0;
+
+ /* If multiplication is much faster than division, and the
+ dividend is large, pre-invert the divisor, and use
+ only multiplications in the inner loop. */
+
+ /* This test should be read:
+ Does it ever help to use udiv_qrnnd_preinv?
+ && Does what we save compensate for the inversion overhead? */
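+  /* For instance (illustrative figures): with UDIV_TIME = 40 and
+     UMUL_TIME = 4 the guard reads 26 * dividend_size > 40, so the
+     pre-inverted path is taken from two limbs upward. */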
+ if (UDIV_TIME > (2 * UMUL_TIME + 6)
+ && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, divisor_limb);
+ if (normalization_steps != 0)
+ {
+ mp_limb_t divisor_limb_inverted;
+
+ divisor_limb <<= normalization_steps;
+ invert_limb (divisor_limb_inverted, divisor_limb);
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ if (r == 0
+ && divisor_limb > ((n1 << normalization_steps)
+ | (dividend_ptr[dividend_size - 2] >> ...)))
+ ...one division less... */
+
+ for (i = dividend_size - 2; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (dummy, r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+ divisor_limb, divisor_limb_inverted);
+ n1 = n0;
+ }
+ udiv_qrnnd_preinv (dummy, r, r,
+ n1 << normalization_steps,
+ divisor_limb, divisor_limb_inverted);
+ return r >> normalization_steps;
+ }
+ else
+ {
+ mp_limb_t divisor_limb_inverted;
+
+ invert_limb (divisor_limb_inverted, divisor_limb);
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ i--;
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (dummy, r, r,
+ n0, divisor_limb, divisor_limb_inverted);
+ }
+ return r;
+ }
+ }
+ else
+ {
+ if (UDIV_NEEDS_NORMALIZATION)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, divisor_limb);
+ if (normalization_steps != 0)
+ {
+ divisor_limb <<= normalization_steps;
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ if (r == 0
+ && divisor_limb > ((n1 << normalization_steps)
+ | (dividend_ptr[dividend_size - 2] >> ...)))
+ ...one division less... */
+
+ for (i = dividend_size - 2; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd (dummy, r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+ divisor_limb);
+ n1 = n0;
+ }
+ udiv_qrnnd (dummy, r, r,
+ n1 << normalization_steps,
+ divisor_limb);
+ return r >> normalization_steps;
+ }
+ }
+ /* No normalization needed, either because udiv_qrnnd doesn't require
+ it, or because DIVISOR_LIMB is already normalized. */
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ i--;
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd (dummy, r, r, n0, divisor_limb);
+ }
+ return r;
+ }
+}
diff --git a/rts/gmp/mpn/generic/mod_1_rs.c b/rts/gmp/mpn/generic/mod_1_rs.c
new file mode 100644
index 0000000000..62aaa94b92
--- /dev/null
+++ b/rts/gmp/mpn/generic/mod_1_rs.c
@@ -0,0 +1,111 @@
+/* mpn_mod_1_rshift -- mpn remainder under hypothetical right shift.
+
+ THE FUNCTION IN THIS FILE IS FOR INTERNAL USE AND HAS A MUTABLE
+ INTERFACE. IT IS ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.
+ IT'S ALMOST GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP
+ RELEASE. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* When testing on a CPU with UDIV_NEEDS_NORMALIZATION equal to 0, it can be
+ changed to 1 temporarily to test the code under that case too. */
+#if 0
+#undef UDIV_NEEDS_NORMALIZATION
+#define UDIV_NEEDS_NORMALIZATION 1
+#endif
+
+
+/* Calculate the remainder "(ptr,size >> shift) % divisor". Note ptr,size
+ is unchanged, the shift is only for its effect on the remainder.
+ The shift doesn't even need to be considered until the last limb.
+
+ This function has the normal size!=0 restriction, unlike the basic
+ mpn_mod_1. */
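+
+/* The identity used below: with L = BITS_PER_MP_LIMB,
+     {ptr,size} >> shift == {ptr+1,size-1} * 2^(L-shift) + (ptr[0] >> shift),
+   so the remainder of the high part is computed first and the boundary
+   limb is folded in with a single udiv_qrnnd. */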
+
+mp_limb_t
+#if __STDC__
+mpn_mod_1_rshift (mp_srcptr ptr, mp_size_t size, unsigned shift,
+ mp_limb_t divisor)
+#else
+mpn_mod_1_rshift (ptr, size, shift, divisor)
+ mp_srcptr ptr;
+ mp_size_t size;
+ unsigned shift;
+ mp_limb_t divisor;
+#endif
+{
+ mp_limb_t quot, rem;
+
+ ASSERT (shift >= 1);
+ ASSERT (shift < BITS_PER_MP_LIMB);
+ ASSERT (size >= 1);
+
+ if (size == 1)
+ return (ptr[0] >> shift) % divisor;
+
+#if UDIV_NEEDS_NORMALIZATION
+ {
+ int norm;
+ int delta;
+
+ count_leading_zeros (norm, divisor);
+ divisor <<= norm;
+
+ delta = shift - norm;
+ if (delta == 0)
+ return mpn_mod_1 (ptr, size, divisor) >> norm;
+
+ if (delta > 0)
+ {
+ rem = mpn_mod_1 (ptr+1, size-1, divisor);
+ udiv_qrnnd (quot, rem,
+ rem >> delta,
+ (rem << (BITS_PER_MP_LIMB-delta)) | (ptr[0] >> delta),
+ divisor);
+ return rem >> norm;
+ }
+ else
+ {
+ rem = mpn_mod_1 (ptr, size, divisor);
+ udiv_qrnnd (quot, rem,
+ rem >> (BITS_PER_MP_LIMB+delta),
+ rem << -delta,
+ divisor);
+ return rem >> norm;
+ }
+ }
+
+#else /* !UDIV_NEEDS_NORMALIZATION */
+
+ rem = mpn_mod_1 (ptr+1, size-1, divisor);
+ udiv_qrnnd (quot, rem,
+ rem >> shift,
+ (rem << (BITS_PER_MP_LIMB-shift)) | (ptr[0] >> shift),
+ divisor);
+ return rem;
+
+#endif
+}
diff --git a/rts/gmp/mpn/generic/mul.c b/rts/gmp/mpn/generic/mul.c
new file mode 100644
index 0000000000..cecfa19ca1
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul.c
@@ -0,0 +1,190 @@
+/* mpn_mul -- Multiply two natural numbers.
+
+ THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul)
+ ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
+ THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
+ THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
+ (pointed to by VP, with VN limbs), and store the result at PRODP. The
+ result is UN + VN limbs. Return the most significant limb of the result.
+
+ NOTE: The space pointed to by PRODP is overwritten before finished with U
+ and V, so overlap is an error.
+
+ Argument constraints:
+ 1. UN >= VN.
+ 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
+ the multiplier and the multiplicand. */
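+
+/* For unequal sizes above the Karatsuba threshold, mpn_mul below works in
+   vn-limb chunks: an initial mpn_mul_n on the low vn limbs of u, further
+   vn x vn products accumulated at increasing limb offsets, and finally one
+   mpn_mul_basecase once the remaining piece drops under the threshold. */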
+
+void
+#if __STDC__
+mpn_sqr_n (mp_ptr prodp,
+ mp_srcptr up, mp_size_t un)
+#else
+mpn_sqr_n (prodp, up, un)
+ mp_ptr prodp;
+ mp_srcptr up;
+ mp_size_t un;
+#endif
+{
+ if (un < KARATSUBA_SQR_THRESHOLD)
+    { /* plain schoolbook squaring */
+ if (un == 0)
+ return;
+ mpn_sqr_basecase (prodp, up, un);
+ }
+ else if (un < TOOM3_SQR_THRESHOLD)
+    { /* karatsuba squaring */
+ mp_ptr tspace;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB);
+ mpn_kara_sqr_n (prodp, up, un, tspace);
+ TMP_FREE (marker);
+ }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else if (un < FFT_SQR_THRESHOLD)
+#else
+ else
+#endif
+    { /* toom3 squaring */
+ mp_ptr tspace;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB);
+ mpn_toom3_sqr_n (prodp, up, un, tspace);
+ TMP_FREE (marker);
+ }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else
+ {
+      /* schoenhage squaring */
+ mpn_mul_fft_full (prodp, up, un, up, un);
+ }
+#endif
+}
+
+mp_limb_t
+#if __STDC__
+mpn_mul (mp_ptr prodp,
+ mp_srcptr up, mp_size_t un,
+ mp_srcptr vp, mp_size_t vn)
+#else
+mpn_mul (prodp, up, un, vp, vn)
+ mp_ptr prodp;
+ mp_srcptr up;
+ mp_size_t un;
+ mp_srcptr vp;
+ mp_size_t vn;
+#endif
+{
+ mp_size_t l;
+ mp_limb_t c;
+
+ if (up == vp && un == vn)
+ {
+ mpn_sqr_n (prodp, up, un);
+ return prodp[2 * un - 1];
+ }
+
+ if (vn < KARATSUBA_MUL_THRESHOLD)
+ { /* long multiplication */
+ mpn_mul_basecase (prodp, up, un, vp, vn);
+ return prodp[un + vn - 1];
+ }
+
+ mpn_mul_n (prodp, up, vp, vn);
+ if (un != vn)
+ { mp_limb_t t;
+ mp_ptr ws;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ prodp += vn;
+ l = vn;
+ up += vn;
+ un -= vn;
+
+ if (un < vn)
+ {
+ /* Swap u's and v's. */
+ MPN_SRCPTR_SWAP (up,un, vp,vn);
+ }
+
+ ws = (mp_ptr) TMP_ALLOC (((vn >= KARATSUBA_MUL_THRESHOLD ? vn : un) + vn)
+ * BYTES_PER_MP_LIMB);
+
+ t = 0;
+ while (vn >= KARATSUBA_MUL_THRESHOLD)
+ {
+ mpn_mul_n (ws, up, vp, vn);
+ if (l <= 2*vn)
+ {
+ t += mpn_add_n (prodp, prodp, ws, l);
+ if (l != 2*vn)
+ {
+ t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t);
+ l = 2*vn;
+ }
+ }
+ else
+ {
+ c = mpn_add_n (prodp, prodp, ws, 2*vn);
+ t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c);
+ }
+ prodp += vn;
+ l -= vn;
+ up += vn;
+ un -= vn;
+ if (un < vn)
+ {
+ /* Swap u's and v's. */
+ MPN_SRCPTR_SWAP (up,un, vp,vn);
+ }
+ }
+
+ if (vn)
+ {
+ mpn_mul_basecase (ws, up, un, vp, vn);
+ if (l <= un + vn)
+ {
+ t += mpn_add_n (prodp, prodp, ws, l);
+ if (l != un + vn)
+ t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t);
+ }
+ else
+ {
+ c = mpn_add_n (prodp, prodp, ws, un + vn);
+ t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c);
+ }
+ }
+
+ TMP_FREE (marker);
+ }
+ return prodp[un + vn - 1];
+}
diff --git a/rts/gmp/mpn/generic/mul_1.c b/rts/gmp/mpn/generic/mul_1.c
new file mode 100644
index 0000000000..1c36b5fb1f
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_1.c
@@ -0,0 +1,59 @@
+/* mpn_mul_1 -- Multiply a limb vector with a single limb and
+ store the product in a second limb vector.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+#if __STDC__
+mpn_mul_1 (register mp_ptr res_ptr, register mp_srcptr s1_ptr,
+           mp_size_t s1_size, register mp_limb_t s2_limb)
+#else
+mpn_mul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+     register mp_ptr res_ptr;
+     register mp_srcptr s1_ptr;
+     mp_size_t s1_size;
+     register mp_limb_t s2_limb;
+#endif
+{
+ register mp_limb_t cy_limb;
+ register mp_size_t j;
+ register mp_limb_t prod_high, prod_low;
+
+ /* The loop counter and index J goes from -S1_SIZE to -1. This way
+ the loop becomes faster. */
+ j = -s1_size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ res_ptr -= j;
+
+ cy_limb = 0;
+ do
+ {
+ umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb) + prod_high;
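+      /* prod_low wrapped around iff the sum is smaller than the carry
+         that was just added */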
+
+ res_ptr[j] = prod_low;
+ }
+ while (++j != 0);
+
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/mul_basecase.c b/rts/gmp/mpn/generic/mul_basecase.c
new file mode 100644
index 0000000000..00c06aa5c4
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_basecase.c
@@ -0,0 +1,87 @@
+/* mpn_mul_basecase -- Internal routine to multiply two natural numbers
+ of length m and n.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
+ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Handle simple cases with traditional multiplication.
+
+ This is the most critical code of multiplication. All multiplies rely on
+ this, both small and huge. Small ones arrive here immediately, huge ones
+ arrive here as this is the base case for Karatsuba's recursive algorithm. */
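+
+/* Schoolbook scheme: prod = sum over j of (u * vp[j]) * B^j, where B is the
+   limb base.  The j = 0 term is stored with mpn_mul_1 (nothing to add to
+   yet, and storing avoids zero-initializing PROD), and each later term is
+   accumulated with mpn_addmul_1 one limb position further up. */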
+
+void
+#if __STDC__
+mpn_mul_basecase (mp_ptr prodp,
+ mp_srcptr up, mp_size_t usize,
+ mp_srcptr vp, mp_size_t vsize)
+#else
+mpn_mul_basecase (prodp, up, usize, vp, vsize)
+ mp_ptr prodp;
+ mp_srcptr up;
+ mp_size_t usize;
+ mp_srcptr vp;
+ mp_size_t vsize;
+#endif
+{
+ /* We first multiply by the low order one or two limbs, as the result can
+ be stored, not added, to PROD. We also avoid a loop for zeroing this
+ way. */
+#if HAVE_NATIVE_mpn_mul_2
+ if (vsize >= 2)
+ {
+ prodp[usize + 1] = mpn_mul_2 (prodp, up, usize, vp[0], vp[1]);
+ prodp += 2, vp += 2, vsize -= 2;
+ }
+ else
+ {
+ prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]);
+ return;
+ }
+#else
+ prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]);
+ prodp += 1, vp += 1, vsize -= 1;
+#endif
+
+#if HAVE_NATIVE_mpn_addmul_2
+ while (vsize >= 2)
+ {
+ prodp[usize + 1] = mpn_addmul_2 (prodp, up, usize, vp[0], vp[1]);
+ prodp += 2, vp += 2, vsize -= 2;
+ }
+ if (vsize != 0)
+ prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]);
+#else
+ /* For each iteration in the loop, multiply U with one limb from V, and
+ add the result to PROD. */
+ while (vsize != 0)
+ {
+ prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]);
+ prodp += 1, vp += 1, vsize -= 1;
+ }
+#endif
+}
diff --git a/rts/gmp/mpn/generic/mul_fft.c b/rts/gmp/mpn/generic/mul_fft.c
new file mode 100644
index 0000000000..00fd6d72de
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_fft.c
@@ -0,0 +1,772 @@
+/* An implementation in GMP of Schönhage's fast multiplication algorithm
+ modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998.
+
+ THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE
+ MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
+ INTERFACES. IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN
+ A FUTURE GNU MP RELEASE.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+/* References:
+
+   Schnelle Multiplikation großer Zahlen, by Arnold Schönhage and Volker
+ Strassen, Computing 7, p. 281-292, 1971.
+
+ Asymptotically fast algorithms for the numerical multiplication
+   and division of polynomials with complex coefficients, by Arnold Schönhage,
+ Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982.
+
+ Tapes versus Pointers, a study in implementing fast algorithms,
+   by Arnold Schönhage, Bulletin of the EATCS, 30, p. 23-32, 1986.
+
+ See also http://www.loria.fr/~zimmerma/bignum
+
+
+ Future:
+
+ K==2 isn't needed in the current uses of this code and the bits specific
+ for that could be dropped.
+
+ It might be possible to avoid a small number of MPN_COPYs by using a
+ rotating temporary or two.
+
+ Multiplications of unequal sized operands can be done with this code, but
+ it needs a tighter test for identifying squaring (same sizes as well as
+ same pointers). */
+
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* Change this to "#define TRACE(x) x" for some traces. */
+#define TRACE(x)
+
+
+
+FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = {
+ FFT_MUL_TABLE,
+ FFT_SQR_TABLE
+};
+
+
+static void mpn_mul_fft_internal
+_PROTO ((mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
+ int k, int K,
+ mp_limb_t **Ap, mp_limb_t **Bp,
+ mp_limb_t *A, mp_limb_t *B,
+ mp_size_t nprime, mp_size_t l, mp_size_t Mp, int **_fft_l,
+ mp_limb_t *T, int rec));
+
+
+/* Find the best k to use for a mod 2^(n*BITS_PER_MP_LIMB)+1 FFT.
+   sqr==0 for a multiply, sqr==1 for a square */
+int
+#if __STDC__
+mpn_fft_best_k (mp_size_t n, int sqr)
+#else
+mpn_fft_best_k (n, sqr)
+ mp_size_t n;
+ int sqr;
+#endif
+{
+ mp_size_t t;
+ int i;
+
+ for (i = 0; mpn_fft_table[sqr][i] != 0; i++)
+ if (n < mpn_fft_table[sqr][i])
+ return i + FFT_FIRST_K;
+
+ /* treat 4*last as one further entry */
+ if (i == 0 || n < 4*mpn_fft_table[sqr][i-1])
+ return i + FFT_FIRST_K;
+ else
+ return i + FFT_FIRST_K + 1;
+}
+
+
+/* Returns smallest possible number of limbs >= pl for a fft of size 2^k.
+ FIXME: Is this simply pl rounded up to the next multiple of 2^k ? */
+
+mp_size_t
+#if __STDC__
+mpn_fft_next_size (mp_size_t pl, int k)
+#else
+mpn_fft_next_size (pl, k)
+ mp_size_t pl;
+ int k;
+#endif
+{
+ mp_size_t N, M;
+ int K;
+
+ /* if (k==0) k = mpn_fft_best_k (pl, sqr); */
+ N = pl*BITS_PER_MP_LIMB;
+ K = 1<<k;
+ if (N%K) N=(N/K+1)*K;
+ M = N/K;
+ if (M%BITS_PER_MP_LIMB) N=((M/BITS_PER_MP_LIMB)+1)*BITS_PER_MP_LIMB*K;
+ return (N/BITS_PER_MP_LIMB);
+}
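+
+/* Example (illustrative): k = 4, pl = 1000 32-bit limbs.  N = 32000 bits is
+   already a multiple of K = 16, but M = 2000 is not a multiple of 32, so N
+   becomes 63*512 = 32256 bits and 1008 limbs is returned, i.e. pl rounded
+   up to a multiple of 2^k. */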
+
+
+static void
+#if __STDC__
+mpn_fft_initl(int **l, int k)
+#else
+mpn_fft_initl(l, k)
+ int **l;
+ int k;
+#endif
+{
+ int i,j,K;
+
+ l[0][0] = 0;
+ for (i=1,K=2;i<=k;i++,K*=2) {
+ for (j=0;j<K/2;j++) {
+ l[i][j] = 2*l[i-1][j];
+ l[i][K/2+j] = 1+l[i][j];
+ }
+ }
+}
+
+
+/* a <- -a mod 2^(n*BITS_PER_MP_LIMB)+1 */
+static void
+#if __STDC__
+mpn_fft_neg_modF(mp_limb_t *ap, mp_size_t n)
+#else
+mpn_fft_neg_modF(ap, n)
+ mp_limb_t *ap;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t c;
+
+ c = ap[n]+2;
+ mpn_com_n (ap, ap, n);
+ ap[n]=0; mpn_incr_u(ap, c);
+}
+
+
+/* a <- a*2^e mod 2^(n*BITS_PER_MP_LIMB)+1 */
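+/* Since 2^(n*BITS_PER_MP_LIMB) == -1 modulo 2^(n*BITS_PER_MP_LIMB)+1, the
+   multiply reduces to a rotation: with d = e mod (n*BITS_PER_MP_LIMB), limbs
+   shifted out at the top wrap around to the bottom with negated sign, and a
+   whole-number negation applies when floor(e/(n*BITS_PER_MP_LIMB)) is odd. */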
+static void
+#if __STDC__
+mpn_fft_mul_2exp_modF(mp_limb_t *ap, int e, mp_size_t n, mp_limb_t *tp)
+#else
+mpn_fft_mul_2exp_modF(ap, e, n, tp)
+ mp_limb_t *ap;
+ int e;
+ mp_size_t n;
+ mp_limb_t *tp;
+#endif
+{
+ int d, sh, i; mp_limb_t cc;
+
+ d = e%(n*BITS_PER_MP_LIMB); /* 2^e = (+/-) 2^d */
+ sh = d % BITS_PER_MP_LIMB;
+ if (sh) mpn_lshift(tp, ap, n+1, sh); /* no carry here */
+ else MPN_COPY(tp, ap, n+1);
+ d /= BITS_PER_MP_LIMB; /* now shift of d limbs to the left */
+ if (d) {
+ /* ap[d..n-1] = tp[0..n-d-1], ap[0..d-1] = -tp[n-d..n-1] */
+ /* mpn_xor would be more efficient here */
+ for (i=d-1;i>=0;i--) ap[i] = ~tp[n-d+i];
+ cc = 1-mpn_add_1(ap, ap, d, 1);
+ if (cc) cc=mpn_sub_1(ap+d, tp, n-d, 1);
+ else MPN_COPY(ap+d, tp, n-d);
+ if (cc+=mpn_sub_1(ap+d, ap+d, n-d, tp[n]))
+ ap[n]=mpn_add_1(ap, ap, n, cc);
+ else ap[n]=0;
+ }
+ else if ((ap[n]=mpn_sub_1(ap, tp, n, tp[n]))) {
+ ap[n]=mpn_add_1(ap, ap, n, 1);
+ }
+ if ((e/(n*BITS_PER_MP_LIMB))%2) mpn_fft_neg_modF(ap, n);
+}
+
+
+/* a <- a+b mod 2^(n*BITS_PER_MP_LIMB)+1 */
+static void
+#if __STDC__
+mpn_fft_add_modF (mp_limb_t *ap, mp_limb_t *bp, int n)
+#else
+mpn_fft_add_modF (ap, bp, n)
+ mp_limb_t *ap,*bp;
+ int n;
+#endif
+{
+ mp_limb_t c;
+
+ c = ap[n] + bp[n] + mpn_add_n(ap, ap, bp, n);
+ if (c>1) c -= 1+mpn_sub_1(ap,ap,n,1);
+ ap[n]=c;
+}
+
+
+/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
+ N=n*BITS_PER_MP_LIMB
+ 2^omega is a primitive root mod 2^N+1
+ output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
+
+static void
+#if __STDC__
+mpn_fft_fft_sqr (mp_limb_t **Ap, mp_size_t K, int **ll,
+ mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
+#else
+mpn_fft_fft_sqr(Ap,K,ll,omega,n,inc,tp)
+mp_limb_t **Ap,*tp;
+mp_size_t K,omega,n,inc;
+int **ll;
+#endif
+{
+ if (K==2) {
+#ifdef ADDSUB
+ if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
+#else
+ MPN_COPY(tp, Ap[0], n+1);
+ mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
+ if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
+#endif
+ Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
+ }
+ else {
+ int j, inc2=2*inc;
+ int *lk = *ll;
+ mp_limb_t *tmp;
+ TMP_DECL(marker);
+
+ TMP_MARK(marker);
+ tmp = TMP_ALLOC_LIMBS (n+1);
+ mpn_fft_fft_sqr(Ap, K/2,ll-1,2*omega,n,inc2, tp);
+ mpn_fft_fft_sqr(Ap+inc, K/2,ll-1,2*omega,n,inc2, tp);
+ /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
+ A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
+ for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc) {
+ MPN_COPY(tp, Ap[inc], n+1);
+ mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
+ mpn_fft_add_modF(Ap[inc], Ap[0], n);
+ mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
+ mpn_fft_add_modF(Ap[0], tp, n);
+ }
+ TMP_FREE(marker);
+ }
+}
+
+
+/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
+ N=n*BITS_PER_MP_LIMB
+ 2^omega is a primitive root mod 2^N+1
+ output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
+
+static void
+#if __STDC__
+mpn_fft_fft (mp_limb_t **Ap, mp_limb_t **Bp, mp_size_t K, int **ll,
+ mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
+#else
+mpn_fft_fft(Ap,Bp,K,ll,omega,n,inc,tp)
+ mp_limb_t **Ap,**Bp,*tp;
+ mp_size_t K,omega,n,inc;
+ int **ll;
+#endif
+{
+ if (K==2) {
+#ifdef ADDSUB
+ if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
+#else
+ MPN_COPY(tp, Ap[0], n+1);
+ mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
+ if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
+#endif
+ Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
+#ifdef ADDSUB
+ if (mpn_addsub_n(Bp[0], Bp[inc], Bp[0], Bp[inc], n+1) & 1)
+#else
+ MPN_COPY(tp, Bp[0], n+1);
+ mpn_add_n(Bp[0], Bp[0], Bp[inc],n+1);
+ if (mpn_sub_n(Bp[inc], tp, Bp[inc],n+1))
+#endif
+ Bp[inc][n] = mpn_add_1(Bp[inc], Bp[inc], n, 1);
+ }
+ else {
+ int j, inc2=2*inc;
+ int *lk=*ll;
+ mp_limb_t *tmp;
+ TMP_DECL(marker);
+
+ TMP_MARK(marker);
+ tmp = TMP_ALLOC_LIMBS (n+1);
+ mpn_fft_fft(Ap, Bp, K/2,ll-1,2*omega,n,inc2, tp);
+ mpn_fft_fft(Ap+inc, Bp+inc, K/2,ll-1,2*omega,n,inc2, tp);
+ /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
+ A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
+ for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc,Bp+=2*inc) {
+ MPN_COPY(tp, Ap[inc], n+1);
+ mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
+ mpn_fft_add_modF(Ap[inc], Ap[0], n);
+ mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
+ mpn_fft_add_modF(Ap[0], tp, n);
+ MPN_COPY(tp, Bp[inc], n+1);
+ mpn_fft_mul_2exp_modF(Bp[inc], lk[1]*omega, n, tmp);
+ mpn_fft_add_modF(Bp[inc], Bp[0], n);
+ mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
+ mpn_fft_add_modF(Bp[0], tp, n);
+ }
+ TMP_FREE(marker);
+ }
+}
+
+
+/* a[i] <- a[i]*b[i] mod 2^(n*BITS_PER_MP_LIMB)+1 for 0 <= i < K */
+static void
+#if __STDC__
+mpn_fft_mul_modF_K (mp_limb_t **ap, mp_limb_t **bp, mp_size_t n, int K)
+#else
+mpn_fft_mul_modF_K(ap, bp, n, K)
+ mp_limb_t **ap, **bp;
+ mp_size_t n;
+ int K;
+#endif
+{
+ int i;
+ int sqr = (ap == bp);
+ TMP_DECL(marker);
+
+ TMP_MARK(marker);
+
+ if (n >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
+ int k, K2,nprime2,Nprime2,M2,maxLK,l,Mp2;
+ int **_fft_l;
+ mp_limb_t **Ap,**Bp,*A,*B,*T;
+
+ k = mpn_fft_best_k (n, sqr);
+ K2 = 1<<k;
+ maxLK = (K2>BITS_PER_MP_LIMB) ? K2 : BITS_PER_MP_LIMB;
+ M2 = n*BITS_PER_MP_LIMB/K2;
+ l = n/K2;
+ Nprime2 = ((2*M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/
+ nprime2 = Nprime2/BITS_PER_MP_LIMB;
+ Mp2 = Nprime2/K2;
+
+ Ap = TMP_ALLOC_MP_PTRS (K2);
+ Bp = TMP_ALLOC_MP_PTRS (K2);
+ A = TMP_ALLOC_LIMBS (2*K2*(nprime2+1));
+ T = TMP_ALLOC_LIMBS (nprime2+1);
+ B = A + K2*(nprime2+1);
+ _fft_l = TMP_ALLOC_TYPE (k+1, int*);
+ for (i=0;i<=k;i++)
+ _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
+ mpn_fft_initl(_fft_l, k);
+
+ TRACE (printf("recurse: %dx%d limbs -> %d times %dx%d (%1.2f)\n", n,
+ n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
+
+ for (i=0;i<K;i++,ap++,bp++)
+ mpn_mul_fft_internal(*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2,
+ l, Mp2, _fft_l, T, 1);
+ }
+ else {
+ mp_limb_t *a, *b, cc, *tp, *tpn; int n2=2*n;
+ tp = TMP_ALLOC_LIMBS (n2);
+ tpn = tp+n;
+ TRACE (printf (" mpn_mul_n %d of %d limbs\n", K, n));
+ for (i=0;i<K;i++) {
+ a = *ap++; b=*bp++;
+ if (sqr)
+ mpn_sqr_n(tp, a, n);
+ else
+ mpn_mul_n(tp, b, a, n);
+ if (a[n]) cc=mpn_add_n(tpn, tpn, b, n); else cc=0;
+ if (b[n]) cc += mpn_add_n(tpn, tpn, a, n) + a[n];
+ if (cc) {
+ cc = mpn_add_1(tp, tp, n2, cc);
+ ASSERT_NOCARRY (mpn_add_1(tp, tp, n2, cc));
+ }
+ a[n] = mpn_sub_n(a, tp, tpn, n) && mpn_add_1(a, a, n, 1);
+ }
+ }
+ TMP_FREE(marker);
+}
+
+
+/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]]
+ output: K*A[0] K*A[K-1] ... K*A[1] */
+
+static void
+#if __STDC__
+mpn_fft_fftinv (mp_limb_t **Ap, int K, mp_size_t omega, mp_size_t n,
+ mp_limb_t *tp)
+#else
+mpn_fft_fftinv(Ap,K,omega,n,tp)
+ mp_limb_t **Ap, *tp;
+ int K;
+ mp_size_t omega, n;
+#endif
+{
+ if (K==2) {
+#ifdef ADDSUB
+ if (mpn_addsub_n(Ap[0], Ap[1], Ap[0], Ap[1], n+1) & 1)
+#else
+ MPN_COPY(tp, Ap[0], n+1);
+ mpn_add_n(Ap[0], Ap[0], Ap[1], n+1);
+ if (mpn_sub_n(Ap[1], tp, Ap[1], n+1))
+#endif
+ Ap[1][n] = mpn_add_1(Ap[1], Ap[1], n, 1);
+ }
+ else {
+ int j, K2=K/2; mp_limb_t **Bp=Ap+K2, *tmp;
+ TMP_DECL(marker);
+
+ TMP_MARK(marker);
+ tmp = TMP_ALLOC_LIMBS (n+1);
+ mpn_fft_fftinv(Ap, K2, 2*omega, n, tp);
+ mpn_fft_fftinv(Bp, K2, 2*omega, n, tp);
+ /* A[j] <- A[j] + omega^j A[j+K/2]
+ A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */
+ for (j=0;j<K2;j++,Ap++,Bp++) {
+ MPN_COPY(tp, Bp[0], n+1);
+ mpn_fft_mul_2exp_modF(Bp[0], (j+K2)*omega, n, tmp);
+ mpn_fft_add_modF(Bp[0], Ap[0], n);
+ mpn_fft_mul_2exp_modF(tp, j*omega, n, tmp);
+ mpn_fft_add_modF(Ap[0], tp, n);
+ }
+ TMP_FREE(marker);
+ }
+}
+
+
+/* A <- A/2^k mod 2^(n*BITS_PER_MP_LIMB)+1 */
+static void
+#if __STDC__
+mpn_fft_div_2exp_modF (mp_limb_t *ap, int k, mp_size_t n, mp_limb_t *tp)
+#else
+mpn_fft_div_2exp_modF(ap,k,n,tp)
+ mp_limb_t *ap,*tp;
+ int k;
+ mp_size_t n;
+#endif
+{
+ int i;
+
+ i = 2*n*BITS_PER_MP_LIMB;
+ i = (i-k) % i;
+ mpn_fft_mul_2exp_modF(ap,i,n,tp);
+ /* 1/2^k = 2^(2nL-k) mod 2^(n*BITS_PER_MP_LIMB)+1 */
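+  /* (Since 2^(nL) == -1 mod 2^(nL)+1, we have 2^(2nL) == 1, so
+     multiplying by 2^(2nL-k) above is exactly division by 2^k.) */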
+ /* normalize so that A < 2^(n*BITS_PER_MP_LIMB)+1 */
+ if (ap[n]==1) {
+ for (i=0;i<n && ap[i]==0;i++);
+ if (i<n) {
+ ap[n]=0;
+ mpn_sub_1(ap, ap, n, 1);
+ }
+ }
+}
+
+
+/* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n<=an<=3*n */
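+/* Writing A = A0 + A1*2^(n*BITS_PER_MP_LIMB) + A2*2^(2*n*BITS_PER_MP_LIMB)
+   and using 2^(n*BITS_PER_MP_LIMB) == -1 gives A == A0 - A1 + A2, which is
+   the add/subtract combination performed below. */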
+static void
+#if __STDC__
+mpn_fft_norm_modF(mp_limb_t *rp, mp_limb_t *ap, mp_size_t n, mp_size_t an)
+#else
+mpn_fft_norm_modF(rp, ap, n, an)
+ mp_limb_t *rp;
+ mp_limb_t *ap;
+ mp_size_t n;
+ mp_size_t an;
+#endif
+{
+ mp_size_t l;
+
+ if (an>2*n) {
+ l = n;
+ rp[n] = mpn_add_1(rp+an-2*n, ap+an-2*n, 3*n-an,
+ mpn_add_n(rp,ap,ap+2*n,an-2*n));
+ }
+ else {
+ l = an-n;
+ MPN_COPY(rp, ap, n);
+ rp[n]=0;
+ }
+ if (mpn_sub_n(rp,rp,ap+n,l)) {
+ if (mpn_sub_1(rp+l,rp+l,n+1-l,1))
+ rp[n]=mpn_add_1(rp,rp,n,1);
+ }
+}
+
+
+static void
+#if __STDC__
+mpn_mul_fft_internal(mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
+ int k, int K,
+ mp_limb_t **Ap, mp_limb_t **Bp,
+ mp_limb_t *A, mp_limb_t *B,
+ mp_size_t nprime, mp_size_t l, mp_size_t Mp,
+ int **_fft_l,
+ mp_limb_t *T, int rec)
+#else
+mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,rec)
+ mp_limb_t *op;
+ mp_srcptr n, m;
+ mp_limb_t **Ap,**Bp,*A,*B,*T;
+ mp_size_t pl,nprime;
+ int **_fft_l;
+ int k,K,l,Mp,rec;
+#endif
+{
+ int i, sqr, pla, lo, sh, j;
+ mp_limb_t *p;
+
+ sqr = (n==m);
+
+ TRACE (printf ("pl=%d k=%d K=%d np=%d l=%d Mp=%d rec=%d sqr=%d\n",
+ pl,k,K,nprime,l,Mp,rec,sqr));
+
+ /* decomposition of inputs into arrays Ap[i] and Bp[i] */
+ if (rec) for (i=0;i<K;i++) {
+ Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
+ /* store the next M bits of n into A[i] */
+ /* supposes that M is a multiple of BITS_PER_MP_LIMB */
+ MPN_COPY(Ap[i], n, l); n+=l; MPN_ZERO(Ap[i]+l, nprime+1-l);
+ /* set most significant bits of n and m (important in recursive calls) */
+ if (i==K-1) Ap[i][l]=n[0];
+ mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
+ if (!sqr) {
+ MPN_COPY(Bp[i], m, l); m+=l; MPN_ZERO(Bp[i]+l, nprime+1-l);
+ if (i==K-1) Bp[i][l]=m[0];
+ mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
+ }
+ }
+
+ /* direct fft's */
+ if (sqr) mpn_fft_fft_sqr(Ap,K,_fft_l+k,2*Mp,nprime,1, T);
+ else mpn_fft_fft(Ap,Bp,K,_fft_l+k,2*Mp,nprime,1, T);
+
+ /* term to term multiplications */
+ mpn_fft_mul_modF_K(Ap, (sqr) ? Ap : Bp, nprime, K);
+
+ /* inverse fft's */
+ mpn_fft_fftinv(Ap, K, 2*Mp, nprime, T);
+
+ /* division of terms after inverse fft */
+ for (i=0;i<K;i++) mpn_fft_div_2exp_modF(Ap[i],k+((K-i)%K)*Mp,nprime, T);
+
+ /* addition of terms in result p */
+ MPN_ZERO(T,nprime+1);
+ pla = l*(K-1)+nprime+1; /* number of required limbs for p */
+ p = B; /* B has K*(n'+1) limbs, which is >= pla, i.e. enough */
+ MPN_ZERO(p, pla);
+ sqr=0; /* will accumulate the (signed) carry at p[pla] */
+ for (i=K-1,lo=l*i+nprime,sh=l*i;i>=0;i--,lo-=l,sh-=l) {
+ mp_ptr n = p+sh;
+ j = (K-i)%K;
+ if (mpn_add_n(n,n,Ap[j],nprime+1))
+ sqr += mpn_add_1(n+nprime+1,n+nprime+1,pla-sh-nprime-1,1);
+ T[2*l]=i+1; /* T = (i+1)*2^(2*M) */
+ if (mpn_cmp(Ap[j],T,nprime+1)>0) { /* subtract 2^N'+1 */
+ sqr -= mpn_sub_1(n,n,pla-sh,1);
+ sqr -= mpn_sub_1(p+lo,p+lo,pla-lo,1);
+ }
+ }
+ if (sqr==-1) {
+ if ((sqr=mpn_add_1(p+pla-pl,p+pla-pl,pl,1))) {
+ /* p[pla-pl]...p[pla-1] are all zero */
+ mpn_sub_1(p+pla-pl-1,p+pla-pl-1,pl+1,1);
+ mpn_sub_1(p+pla-1,p+pla-1,1,1);
+ }
+ }
+ else if (sqr==1) {
+ if (pla>=2*pl)
+ while ((sqr=mpn_add_1(p+pla-2*pl,p+pla-2*pl,2*pl,sqr)));
+ else {
+ sqr = mpn_sub_1(p+pla-pl,p+pla-pl,pl,sqr);
+ ASSERT (sqr == 0);
+ }
+ }
+ else
+ ASSERT (sqr == 0);
+
+ /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
+ < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
+ < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
+ mpn_fft_norm_modF(op,p,pl,pla);
+}
+
+
+/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB
+ n and m have respectively nl and ml limbs
+ op must have space for pl+1 limbs
+ One must have pl = mpn_fft_next_size(pl, k).
+*/
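+/* (mpn_mul_fft_full at the end of this file shows the intended call
+   pattern: k = mpn_fft_best_k (nl+ml, sqr) followed by
+   pl = mpn_fft_next_size (nl+ml, k).) */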
+
+void
+#if __STDC__
+mpn_mul_fft (mp_ptr op, mp_size_t pl,
+ mp_srcptr n, mp_size_t nl,
+ mp_srcptr m, mp_size_t ml,
+ int k)
+#else
+mpn_mul_fft (op, pl, n, nl, m, ml, k)
+ mp_ptr op;
+ mp_size_t pl;
+ mp_srcptr n;
+ mp_size_t nl;
+ mp_srcptr m;
+ mp_size_t ml;
+ int k;
+#endif
+{
+ int K,maxLK,i,j;
+ mp_size_t N,Nprime,nprime,M,Mp,l;
+ mp_limb_t **Ap,**Bp,*A,*T,*B;
+ int **_fft_l;
+ int sqr = (n==m && nl==ml);
+ TMP_DECL(marker);
+
+ TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n",
+ pl, nl, ml, k));
+ ASSERT_ALWAYS (mpn_fft_next_size(pl, k) == pl);
+
+ TMP_MARK(marker);
+ N = pl*BITS_PER_MP_LIMB;
+ _fft_l = TMP_ALLOC_TYPE (k+1, int*);
+ for (i=0;i<=k;i++)
+ _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
+ mpn_fft_initl(_fft_l, k);
+ K = 1<<k;
+ M = N/K; /* N = 2^k M */
+ l = M/BITS_PER_MP_LIMB;
+ maxLK = (K>BITS_PER_MP_LIMB) ? K : BITS_PER_MP_LIMB;
+
+ Nprime = ((2*M+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */
+ nprime = Nprime/BITS_PER_MP_LIMB;
+ TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n",
+ N, K, M, l, maxLK, Nprime, nprime));
+ if (nprime >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
+ maxLK = (1<<mpn_fft_best_k(nprime,n==m))*BITS_PER_MP_LIMB;
+ if (Nprime % maxLK) {
+ Nprime=((Nprime/maxLK)+1)*maxLK;
+ nprime = Nprime/BITS_PER_MP_LIMB;
+ }
+ TRACE (printf ("new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime));
+ }
+
+ T = TMP_ALLOC_LIMBS (nprime+1);
+ Mp = Nprime/K;
+
+ TRACE (printf("%dx%d limbs -> %d times %dx%d limbs (%1.2f)\n",
+ pl,pl,K,nprime,nprime,2.0*(double)N/Nprime/K);
+ printf(" temp space %ld\n", 2*K*(nprime+1)));
+
+ A = _MP_ALLOCATE_FUNC_LIMBS (2*K*(nprime+1));
+ B = A+K*(nprime+1);
+ Ap = TMP_ALLOC_MP_PTRS (K);
+ Bp = TMP_ALLOC_MP_PTRS (K);
+ /* special decomposition for main call */
+ for (i=0;i<K;i++) {
+ Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
+ /* store the next M bits of n into A[i] */
+ /* supposes that M is a multiple of BITS_PER_MP_LIMB */
+ if (nl>0) {
+ j = (nl>=l) ? l : nl; /* limbs to store in Ap[i] */
+ MPN_COPY(Ap[i], n, j); n+=l; MPN_ZERO(Ap[i]+j, nprime+1-j);
+ mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
+ }
+ else MPN_ZERO(Ap[i], nprime+1);
+ nl -= l;
+ if (n!=m) {
+ if (ml>0) {
+ j = (ml>=l) ? l : ml; /* limbs to store in Bp[i] */
+ MPN_COPY(Bp[i], m, j); m+=l; MPN_ZERO(Bp[i]+j, nprime+1-j);
+ mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
+ }
+ else MPN_ZERO(Bp[i], nprime+1);
+ }
+ ml -= l;
+ }
+ mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,0);
+ TMP_FREE(marker);
+ _MP_FREE_FUNC_LIMBS (A, 2*K*(nprime+1));
+}
+
+
+#if WANT_ASSERT
+static int
+#if __STDC__
+mpn_zero_p (mp_ptr p, mp_size_t n)
+#else
+ mpn_zero_p (p, n)
+ mp_ptr p;
+ mp_size_t n;
+#endif
+{
+ mp_size_t i;
+
+ for (i = 0; i < n; i++)
+ {
+ if (p[i] != 0)
+ return 0;
+ }
+
+ return 1;
+}
+#endif
+
+
+/* Multiply {n,nl}*{m,ml} and write the result to {op,nl+ml}.
+
+ FIXME: Duplicating the result like this is wasteful, do something better
+ perhaps at the norm_modF stage above. */
+
+void
+#if __STDC__
+mpn_mul_fft_full (mp_ptr op,
+ mp_srcptr n, mp_size_t nl,
+ mp_srcptr m, mp_size_t ml)
+#else
+mpn_mul_fft_full (op, n, nl, m, ml)
+ mp_ptr op;
+ mp_srcptr n;
+ mp_size_t nl;
+ mp_srcptr m;
+ mp_size_t ml;
+#endif
+{
+ mp_ptr pad_op;
+ mp_size_t pl;
+ int k;
+ int sqr = (n==m && nl==ml);
+
+ k = mpn_fft_best_k (nl+ml, sqr);
+ pl = mpn_fft_next_size (nl+ml, k);
+
+ TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl=%ld k=%d\n",
+ nl, ml, pl, k));
+
+ pad_op = _MP_ALLOCATE_FUNC_LIMBS (pl+1);
+ mpn_mul_fft (pad_op, pl, n, nl, m, ml, k);
+
+ ASSERT (mpn_zero_p (pad_op+nl+ml, pl+1-(nl+ml)));
+ MPN_COPY (op, pad_op, nl+ml);
+
+ _MP_FREE_FUNC_LIMBS (pad_op, pl+1);
+}
diff --git a/rts/gmp/mpn/generic/mul_n.c b/rts/gmp/mpn/generic/mul_n.c
new file mode 100644
index 0000000000..b7563be2d3
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_n.c
@@ -0,0 +1,1343 @@
+/* mpn_mul_n and helper function -- Multiply/square natural numbers.
+
+ THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n)
+ ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
+ THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
+ THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
+ 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
+#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1)
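+/* Sanity check for 32 bits: 3 * 0xAAAAAAAB = 0x200000001 == 1 (mod 2^32),
+   so multiplying by INVERSE_3 performs the exact divisions by 3 needed
+   during Toom-3 interpolation. */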
+
+#if !defined (__alpha) && !defined (__mips)
+/* For all other machines, we want to call mpn functions for the compound
+ operations instead of open-coding them. */
+#define USE_MORE_MPN
+#endif
+
+/*== Function declarations =================================================*/
+
+static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
+ mp_ptr, mp_ptr, mp_ptr,
+ mp_srcptr, mp_srcptr, mp_srcptr,
+ mp_size_t, mp_size_t));
+static void interpolate3 _PROTO ((mp_srcptr,
+ mp_ptr, mp_ptr, mp_ptr,
+ mp_srcptr,
+ mp_ptr, mp_ptr, mp_ptr,
+ mp_size_t, mp_size_t));
+static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+
+
+/*-- mpn_kara_mul_n ---------------------------------------------------------------*/
+
+/* Multiplies using 3 half-sized mults and so on recursively.
+ * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
+ * No overlap of p[...] with a[...] or b[...].
+ * ws is workspace.
+ */
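+/* The identity behind this (even case shown, with a = a1*B + a0,
+   b = b1*B + b0 and B = 2^(n2*BITS_PER_MP_LIMB)):
+     a*b = a1*b1*(B^2 + B) + a0*b0*(B + 1) - (a1 - a0)*(b1 - b0)*B
+   so only three half-sized products are needed.  The code computes
+   |a1 - a0| and |b1 - b0| and tracks the overall sign in `sign', which
+   selects between the add and the subtract when the middle term is
+   assembled. */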
+
+void
+#if __STDC__
+mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
+#else
+mpn_kara_mul_n(p, a, b, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_srcptr b;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t i, sign, w, w0, w1;
+ mp_size_t n2;
+ mp_srcptr x, y;
+
+ n2 = n >> 1;
+ ASSERT (n2 > 0);
+
+ if (n & 1)
+ {
+ /* Odd length. */
+ mp_size_t n1, n3, nm1;
+
+ n3 = n - n2;
+
+ sign = 0;
+ w = a[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p, a, a + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n3+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n3;
+ y = a;
+ sign = 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n3;
+ }
+ mpn_sub_n (p, x, y, n2);
+ }
+ p[n2] = w;
+
+ w = b[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p + n3, b, b + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = b[i];
+ w1 = b[n3+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = b + n3;
+ y = b;
+ sign ^= 1;
+ }
+ else
+ {
+ x = b;
+ y = b + n3;
+ }
+ mpn_sub_n (p + n3, x, y, n2);
+ }
+ p[n] = w;
+
+ n1 = n + 1;
+ if (n2 < KARATSUBA_MUL_THRESHOLD)
+ {
+ if (n3 < KARATSUBA_MUL_THRESHOLD)
+ {
+ mpn_mul_basecase (ws, p, n3, p + n3, n3);
+ mpn_mul_basecase (p, a, n3, b, n3);
+ }
+ else
+ {
+ mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
+ mpn_kara_mul_n (p, a, b, n3, ws + n1);
+ }
+ mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
+ }
+ else
+ {
+ mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
+ mpn_kara_mul_n (p, a, b, n3, ws + n1);
+ mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
+ }
+
+ if (sign)
+ mpn_add_n (ws, p, ws, n1);
+ else
+ mpn_sub_n (ws, p, ws, n1);
+
+ nm1 = n - 1;
+ if (mpn_add_n (ws, p + n1, ws, nm1))
+ {
+ mp_limb_t x = ws[nm1] + 1;
+ ws[nm1] = x;
+ if (x == 0)
+ ++ws[n];
+ }
+ if (mpn_add_n (p + n3, p + n3, ws, n1))
+ {
+ mp_limb_t x;
+ i = n1 + n3;
+ do
+ {
+ x = p[i] + 1;
+ p[i] = x;
+ ++i;
+ } while (x == 0);
+ }
+ }
+ else
+ {
+ /* Even length. */
+ mp_limb_t t;
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n2+i];
+ }
+ while (w0 == w1 && i != 0);
+ sign = 0;
+ if (w0 < w1)
+ {
+ x = a + n2;
+ y = a;
+ sign = 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n2;
+ }
+ mpn_sub_n (p, x, y, n2);
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = b[i];
+ w1 = b[n2+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = b + n2;
+ y = b;
+ sign ^= 1;
+ }
+ else
+ {
+ x = b;
+ y = b + n2;
+ }
+ mpn_sub_n (p + n2, x, y, n2);
+
+ /* Pointwise products. */
+ if (n2 < KARATSUBA_MUL_THRESHOLD)
+ {
+ mpn_mul_basecase (ws, p, n2, p + n2, n2);
+ mpn_mul_basecase (p, a, n2, b, n2);
+ mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
+ }
+ else
+ {
+ mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
+ mpn_kara_mul_n (p, a, b, n2, ws + n);
+ mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
+ }
+
+ /* Interpolate. */
+ if (sign)
+ w = mpn_add_n (ws, p, ws, n);
+ else
+ w = -mpn_sub_n (ws, p, ws, n);
+ w += mpn_add_n (ws, p + n, ws, n);
+ w += mpn_add_n (p + n2, p + n2, ws, n);
+ /* TO DO: could put "if (w) { ... }" here.
+ * Less work but badly predicted branch.
+ * No measurable difference in speed on Alpha.
+ */
+ i = n + n2;
+ t = p[i] + w;
+ p[i] = t;
+ if (t < w)
+ {
+ do
+ {
+ ++i;
+ w = p[i] + 1;
+ p[i] = w;
+ }
+ while (w == 0);
+ }
+ }
+}
+
+void
+#if __STDC__
+mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
+#else
+mpn_kara_sqr_n (p, a, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t i, sign, w, w0, w1;
+ mp_size_t n2;
+ mp_srcptr x, y;
+
+ n2 = n >> 1;
+ ASSERT (n2 > 0);
+
+ if (n & 1)
+ {
+ /* Odd length. */
+ mp_size_t n1, n3, nm1;
+
+ n3 = n - n2;
+
+ sign = 0;
+ w = a[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p, a, a + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n3+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n3;
+ y = a;
+ sign = 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n3;
+ }
+ mpn_sub_n (p, x, y, n2);
+ }
+ p[n2] = w;
+
+ w = a[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p + n3, a, a + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n3+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n3;
+ y = a;
+ sign ^= 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n3;
+ }
+ mpn_sub_n (p + n3, x, y, n2);
+ }
+ p[n] = w;
+
+ n1 = n + 1;
+ if (n2 < KARATSUBA_SQR_THRESHOLD)
+ {
+ if (n3 < KARATSUBA_SQR_THRESHOLD)
+ {
+ mpn_sqr_basecase (ws, p, n3);
+ mpn_sqr_basecase (p, a, n3);
+ }
+ else
+ {
+ mpn_kara_sqr_n (ws, p, n3, ws + n1);
+ mpn_kara_sqr_n (p, a, n3, ws + n1);
+ }
+ mpn_sqr_basecase (p + n1, a + n3, n2);
+ }
+ else
+ {
+ mpn_kara_sqr_n (ws, p, n3, ws + n1);
+ mpn_kara_sqr_n (p, a, n3, ws + n1);
+ mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);
+ }
+
+ if (sign)
+ mpn_add_n (ws, p, ws, n1);
+ else
+ mpn_sub_n (ws, p, ws, n1);
+
+ nm1 = n - 1;
+ if (mpn_add_n (ws, p + n1, ws, nm1))
+ {
+ mp_limb_t x = ws[nm1] + 1;
+ ws[nm1] = x;
+ if (x == 0)
+ ++ws[n];
+ }
+ if (mpn_add_n (p + n3, p + n3, ws, n1))
+ {
+ mp_limb_t x;
+ i = n1 + n3;
+ do
+ {
+ x = p[i] + 1;
+ p[i] = x;
+ ++i;
+ } while (x == 0);
+ }
+ }
+ else
+ {
+ /* Even length. */
+ mp_limb_t t;
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n2+i];
+ }
+ while (w0 == w1 && i != 0);
+ sign = 0;
+ if (w0 < w1)
+ {
+ x = a + n2;
+ y = a;
+ sign = 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n2;
+ }
+ mpn_sub_n (p, x, y, n2);
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n2+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n2;
+ y = a;
+ sign ^= 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n2;
+ }
+ mpn_sub_n (p + n2, x, y, n2);
+
+ /* Pointwise products. */
+ if (n2 < KARATSUBA_SQR_THRESHOLD)
+ {
+ mpn_sqr_basecase (ws, p, n2);
+ mpn_sqr_basecase (p, a, n2);
+ mpn_sqr_basecase (p + n, a + n2, n2);
+ }
+ else
+ {
+ mpn_kara_sqr_n (ws, p, n2, ws + n);
+ mpn_kara_sqr_n (p, a, n2, ws + n);
+ mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
+ }
+
+ /* Interpolate. */
+ if (sign)
+ w = mpn_add_n (ws, p, ws, n);
+ else
+ w = -mpn_sub_n (ws, p, ws, n);
+ w += mpn_add_n (ws, p + n, ws, n);
+ w += mpn_add_n (p + n2, p + n2, ws, n);
+ /* TO DO: could put "if (w) { ... }" here.
+ * Less work but badly predicted branch.
+ * No measurable difference in speed on Alpha.
+ */
+ i = n + n2;
+ t = p[i] + w;
+ p[i] = t;
+ if (t < w)
+ {
+ do
+ {
+ ++i;
+ w = p[i] + 1;
+ p[i] = w;
+ }
+ while (w == 0);
+ }
+ }
+}
+
+/*-- add2Times -------------------------------------------------------------*/
+
+/* z[] = x[] + 2 * y[]
+ Note that z and x might point to the same vectors. */
+#ifdef USE_MORE_MPN
+static inline mp_limb_t
+#if __STDC__
+add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
+#else
+add2Times (z, x, y, n)
+ mp_ptr z;
+ mp_srcptr x;
+ mp_srcptr y;
+ mp_size_t n;
+#endif
+{
+ mp_ptr t;
+ mp_limb_t c;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ c = mpn_lshift (t, y, n, 1);
+ c += mpn_add_n (z, x, t, n);
+ TMP_FREE (marker);
+ return c;
+}
+#else
+
+static mp_limb_t
+#if __STDC__
+add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
+#else
+add2Times (z, x, y, n)
+ mp_ptr z;
+ mp_srcptr x;
+ mp_srcptr y;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t c, v, w;
+
+ ASSERT (n > 0);
+ v = *x; w = *y;
+ c = w >> (BITS_PER_MP_LIMB - 1);
+ w <<= 1;
+ v += w;
+ c += v < w;
+ *z = v;
+ ++x; ++y; ++z;
+ while (--n)
+ {
+ v = *x;
+ w = *y;
+ v += c;
+ c = v < c;
+ c += w >> (BITS_PER_MP_LIMB - 1);
+ w <<= 1;
+ v += w;
+ c += v < w;
+ *z = v;
+ ++x; ++y; ++z;
+ }
+
+ return c;
+}
+#endif
+
+/*-- evaluate3 -------------------------------------------------------------*/
+
+/* Evaluates:
+ * ph := 4*A+2*B+C
+ * p1 := A+B+C
+ * p2 := A+2*B+4*C
+ * where:
+ * ph[], p1[], p2[], A[] and B[] all have length len,
+ * C[] has length len2 with len-len2 = 0, 1 or 2.
+ * Returns top words (overflow) at pth, pt1 and pt2 respectively.
+ */
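+/* Viewing A, B, C as the coefficients of f(t) = A*t^2 + B*t + C, the
+   outputs are ph = f(2), p1 = f(1) and p2 = 4*f(1/2); the factor 4 in
+   p2 clears the denominators so all values stay integral. */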
+#ifdef USE_MORE_MPN
+static void
+#if __STDC__
+evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
+ mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2)
+#else
+evaluate3 (ph, p1, p2, pth, pt1, pt2,
+ A, B, C, len, len2)
+ mp_ptr ph;
+ mp_ptr p1;
+ mp_ptr p2;
+ mp_ptr pth;
+ mp_ptr pt1;
+ mp_ptr pt2;
+ mp_srcptr A;
+ mp_srcptr B;
+ mp_srcptr C;
+ mp_size_t len;
+ mp_size_t len2;
+#endif
+{
+ mp_limb_t c, d, e;
+
+ ASSERT (len - len2 <= 2);
+
+ e = mpn_lshift (p1, B, len, 1);
+
+ c = mpn_lshift (ph, A, len, 2);
+ c += e + mpn_add_n (ph, ph, p1, len);
+ d = mpn_add_n (ph, ph, C, len2);
+ if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
+ ASSERT (c < 7);
+ *pth = c;
+
+ c = mpn_lshift (p2, C, len2, 2);
+#if 1
+ if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; }
+ c += e + mpn_add_n (p2, p2, p1, len);
+#else
+ d = mpn_add_n (p2, p2, p1, len2);
+ c += d;
+ if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
+ c += e;
+#endif
+ c += mpn_add_n (p2, p2, A, len);
+ ASSERT (c < 7);
+ *pt2 = c;
+
+ c = mpn_add_n (p1, A, B, len);
+ d = mpn_add_n (p1, p1, C, len2);
+ if (len2 == len) c += d;
+ else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
+ ASSERT (c < 3);
+ *pt1 = c;
+
+}
+
+#else
+
+static void
+#if __STDC__
+evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
+ mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
+#else
+evaluate3 (ph, p1, p2, pth, pt1, pt2,
+ A, B, C, l, ls)
+ mp_ptr ph;
+ mp_ptr p1;
+ mp_ptr p2;
+ mp_ptr pth;
+ mp_ptr pt1;
+ mp_ptr pt2;
+ mp_srcptr A;
+ mp_srcptr B;
+ mp_srcptr C;
+ mp_size_t l;
+ mp_size_t ls;
+#endif
+{
+ mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;
+
+ ASSERT (l - ls <= 2);
+
+ th = t1 = t2 = 0;
+ for (i = 0; i < l; ++i)
+ {
+ a = *A;
+ b = *B;
+ c = i < ls ? *C : 0;
+
+ /* TO DO: choose one of the following alternatives. */
+#if 0
+ t = a << 2;
+ vh = th + t;
+ th = vh < t;
+ th += a >> (BITS_PER_MP_LIMB - 2);
+ t = b << 1;
+ vh += t;
+ th += vh < t;
+ th += b >> (BITS_PER_MP_LIMB - 1);
+ vh += c;
+ th += vh < c;
+#else
+ vh = th + c;
+ th = vh < c;
+ t = b << 1;
+ vh += t;
+ th += vh < t;
+ th += b >> (BITS_PER_MP_LIMB - 1);
+ t = a << 2;
+ vh += t;
+ th += vh < t;
+ th += a >> (BITS_PER_MP_LIMB - 2);
+#endif
+
+ v1 = t1 + a;
+ t1 = v1 < a;
+ v1 += b;
+ t1 += v1 < b;
+ v1 += c;
+ t1 += v1 < c;
+
+ v2 = t2 + a;
+ t2 = v2 < a;
+ t = b << 1;
+ v2 += t;
+ t2 += v2 < t;
+ t2 += b >> (BITS_PER_MP_LIMB - 1);
+ t = c << 2;
+ v2 += t;
+ t2 += v2 < t;
+ t2 += c >> (BITS_PER_MP_LIMB - 2);
+
+ *ph = vh;
+ *p1 = v1;
+ *p2 = v2;
+
+ ++A; ++B; ++C;
+ ++ph; ++p1; ++p2;
+ }
+
+ ASSERT (th < 7);
+ ASSERT (t1 < 3);
+ ASSERT (t2 < 7);
+
+ *pth = th;
+ *pt1 = t1;
+ *pt2 = t2;
+}
+#endif
+
+
+/*-- interpolate3 ----------------------------------------------------------*/
+
+/* Interpolates B, C, D (in-place) from:
+ * 16*A+8*B+4*C+2*D+E
+ * A+B+C+D+E
+ * A+2*B+4*C+8*D+16*E
+ * where:
+ * A[], B[], C[] and D[] all have length l,
+ * E[] has length ls with l-ls = 0, 2 or 4.
+ *
+ * Reads top words (from earlier overflow) from ptb, ptc and ptd,
+ * and returns new top words there.
+ */
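+/* Equivalently: with g(t) = A*t^4 + x1*t^3 + x2*t^2 + x3*t + E, the three
+   inputs are g(2), g(1) and 16*g(1/2), and the steps below solve this
+   linear system for x1, x2, x3 using only shifts, adds/subs and one
+   exact division by 3. */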
+
+#ifdef USE_MORE_MPN
+static void
+#if __STDC__
+interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
+ mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2)
+#else
+interpolate3 (A, B, C, D, E,
+ ptb, ptc, ptd, len, len2)
+ mp_srcptr A;
+ mp_ptr B;
+ mp_ptr C;
+ mp_ptr D;
+ mp_srcptr E;
+ mp_ptr ptb;
+ mp_ptr ptc;
+ mp_ptr ptd;
+ mp_size_t len;
+ mp_size_t len2;
+#endif
+{
+ mp_ptr ws;
+ mp_limb_t t, tb,tc,td;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);
+
+ /* Let x1, x2, x3 be the values to interpolate. We have:
+ * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
+ * c = a + x1 + x2 + x3 + e
+ * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
+ */
+
+ ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);
+
+ tb = *ptb; tc = *ptc; td = *ptd;
+
+
+ /* b := b - 16*a - e
+ * c := c - a - e
+ * d := d - a - 16*e
+ */
+
+ t = mpn_lshift (ws, A, len, 4);
+ tb -= t + mpn_sub_n (B, B, ws, len);
+ t = mpn_sub_n (B, B, E, len2);
+ if (len2 == len) tb -= t;
+ else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);
+
+ tc -= mpn_sub_n (C, C, A, len);
+ t = mpn_sub_n (C, C, E, len2);
+ if (len2 == len) tc -= t;
+ else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);
+
+ t = mpn_lshift (ws, E, len2, 4);
+ t += mpn_add_n (ws, ws, A, len2);
+#if 1
+ if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
+ td -= t + mpn_sub_n (D, D, ws, len);
+#else
+ t += mpn_sub_n (D, D, ws, len2);
+ if (len2 != len) {
+ t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
+ t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
+ } /* end if/else */
+ td -= t;
+#endif
+
+
+ /* b, d := b + d, b - d */
+
+#ifdef HAVE_MPN_ADD_SUB_N
+ /* #error TO DO ... */
+#else
+ t = tb + td + mpn_add_n (ws, B, D, len);
+ td = tb - td - mpn_sub_n (D, B, D, len);
+ tb = t;
+ MPN_COPY (B, ws, len);
+#endif
+
+ /* b := b-8*c */
+ t = 8 * tc + mpn_lshift (ws, C, len, 3);
+ tb -= t + mpn_sub_n (B, B, ws, len);
+
+ /* c := 2*c - b */
+ tc = 2 * tc + mpn_lshift (C, C, len, 1);
+ tc -= tb + mpn_sub_n (C, C, B, len);
+
+ /* d := d/3 */
+ td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3;
+
+ /* b, d := b + d, b - d */
+#ifdef HAVE_MPN_ADD_SUB_N
+ /* #error TO DO ... */
+#else
+ t = tb + td + mpn_add_n (ws, B, D, len);
+ td = tb - td - mpn_sub_n (D, B, D, len);
+ tb = t;
+ MPN_COPY (B, ws, len);
+#endif
+
+ /* Now:
+ * b = 4*x1
+ * c = 2*x2
+ * d = 4*x3
+ */
+
+ ASSERT(!(*B & 3));
+ mpn_rshift (B, B, len, 2);
+ B[len-1] |= tb<<(BITS_PER_MP_LIMB-2);
+ ASSERT((long)tb >= 0);
+ tb >>= 2;
+
+ ASSERT(!(*C & 1));
+ mpn_rshift (C, C, len, 1);
+ C[len-1] |= tc<<(BITS_PER_MP_LIMB-1);
+ ASSERT((long)tc >= 0);
+ tc >>= 1;
+
+ ASSERT(!(*D & 3));
+ mpn_rshift (D, D, len, 2);
+ D[len-1] |= td<<(BITS_PER_MP_LIMB-2);
+ ASSERT((long)td >= 0);
+ td >>= 2;
+
+#if WANT_ASSERT
+ ASSERT (tb < 2);
+ if (len == len2)
+ {
+ ASSERT (tc < 3);
+ ASSERT (td < 2);
+ }
+ else
+ {
+ ASSERT (tc < 2);
+ ASSERT (!td);
+ }
+#endif
+
+ *ptb = tb;
+ *ptc = tc;
+ *ptd = td;
+
+ TMP_FREE (marker);
+}
+
+#else
+
+static void
+#if __STDC__
+interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
+ mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
+#else
+interpolate3 (A, B, C, D, E,
+ ptb, ptc, ptd, l, ls)
+ mp_srcptr A;
+ mp_ptr B;
+ mp_ptr C;
+ mp_ptr D;
+ mp_srcptr E;
+ mp_ptr ptb;
+ mp_ptr ptc;
+ mp_ptr ptd;
+ mp_size_t l;
+ mp_size_t ls;
+#endif
+{
+ mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
+ const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);
+
+#if WANT_ASSERT
+ t = l - ls;
+ ASSERT (t == 0 || t == 2 || t == 4);
+#endif
+
+ sb = sc = sd = 0;
+ for (i = 0; i < l; ++i)
+ {
+ mp_limb_t tb, tc, td, tt;
+
+ a = *A;
+ b = *B;
+ c = *C;
+ d = *D;
+ e = i < ls ? *E : 0;
+
+ /* Let x1, x2, x3 be the values to interpolate. We have:
+ * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
+ * c = a + x1 + x2 + x3 + e
+ * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
+ */
+
+ /* b := b - 16*a - e
+ * c := c - a - e
+ * d := d - a - 16*e
+ */
+ t = a << 4;
+ tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
+ b -= t;
+ tb -= b < e;
+ b -= e;
+ tc = -(c < a);
+ c -= a;
+ tc -= c < e;
+ c -= e;
+ td = -(d < a);
+ d -= a;
+ t = e << 4;
+ td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
+ d -= t;
+
+ /* b, d := b + d, b - d */
+ t = b + d;
+ tt = tb + td + (t < b);
+ td = tb - td - (b < d);
+ d = b - d;
+ b = t;
+ tb = tt;
+
+ /* b := b-8*c */
+ t = c << 3;
+ tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
+ b -= t;
+
+ /* c := 2*c - b */
+ t = c << 1;
+ tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
+ c = t - b;
+
+ /* d := d/3 */
+ d *= INVERSE_3;
+ td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
+ td *= INVERSE_3;
+
+ /* b, d := b + d, b - d */
+ t = b + d;
+ tt = tb + td + (t < b);
+ td = tb - td - (b < d);
+ d = b - d;
+ b = t;
+ tb = tt;
+
+ /* Now:
+ * b = 4*x1
+ * c = 2*x2
+ * d = 4*x3
+ */
+
+ /* sb has period 2. */
+ b += sb;
+ tb += b < sb;
+ sb &= maskOffHalf;
+ sb |= sb >> (BITS_PER_MP_LIMB >> 1);
+ sb += tb;
+
+ /* sc has period 1. */
+ c += sc;
+ tc += c < sc;
+ /* TO DO: choose one of the following alternatives. */
+#if 1
+ sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1));
+ sc += tc;
+#else
+ sc = tc - ((long)sc < 0L);
+#endif
+
+ /* sd has period 2. */
+ d += sd;
+ td += d < sd;
+ sd &= maskOffHalf;
+ sd |= sd >> (BITS_PER_MP_LIMB >> 1);
+ sd += td;
+
+ if (i != 0)
+ {
+ B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
+ C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
+ D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
+ }
+ ob = b >> 2;
+ oc = c >> 1;
+ od = d >> 2;
+
+ ++A; ++B; ++C; ++D; ++E;
+ }
+
+ /* Handle top words. */
+ b = *ptb;
+ c = *ptc;
+ d = *ptd;
+
+ t = b + d;
+ d = b - d;
+ b = t;
+ b -= c << 3;
+ c = (c << 1) - b;
+ d *= INVERSE_3;
+ t = b + d;
+ d = b - d;
+ b = t;
+
+ b += sb;
+ c += sc;
+ d += sd;
+
+ B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
+ C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
+ D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
+
+ b >>= 2;
+ c >>= 1;
+ d >>= 2;
+
+#if WANT_ASSERT
+ ASSERT (b < 2);
+ if (l == ls)
+ {
+ ASSERT (c < 3);
+ ASSERT (d < 2);
+ }
+ else
+ {
+ ASSERT (c < 2);
+ ASSERT (!d);
+ }
+#endif
+
+ *ptb = b;
+ *ptc = c;
+ *ptd = d;
+}
+#endif
+
+
+/*-- mpn_toom3_mul_n --------------------------------------------------------------*/
+
+/* Multiplies using 5 mults of one third size and so on recursively.
+ * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
+ * No overlap of p[...] with a[...] or b[...].
+ * ws is workspace.
+ */
+
+/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the
+ * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
+ * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false.
+ */
+
+#define TOOM3_MUL_REC(p, a, b, n, ws) \
+ do { \
+ if (n < KARATSUBA_MUL_THRESHOLD) \
+ mpn_mul_basecase (p, a, n, b, n); \
+ else if (n < TOOM3_MUL_THRESHOLD) \
+ mpn_kara_mul_n (p, a, b, n, ws); \
+ else \
+ mpn_toom3_mul_n (p, a, b, n, ws); \
+ } while (0)
+
+void
+#if __STDC__
+mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
+#else
+mpn_toom3_mul_n (p, a, b, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_srcptr b;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
+ mp_limb_t *A,*B,*C,*D,*E, *W;
+ mp_size_t l,l2,l3,l4,l5,ls;
+
+ /* Break n words into chunks of size l, l and ls.
+ * n = 3*k => l = k, ls = k
+ * n = 3*k+1 => l = k+1, ls = k-1
+ * n = 3*k+2 => l = k+1, ls = k
+ */
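+  /* For example, n = 10 = 3*3+1 gives l = 4 and ls = 2, so that
+     l + l + ls = n. */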
+ {
+ mp_limb_t m;
+
+ ASSERT (n >= TOOM3_MUL_THRESHOLD);
+ l = ls = n / 3;
+ m = n - l * 3;
+ if (m != 0)
+ ++l;
+ if (m == 1)
+ --ls;
+
+ l2 = l * 2;
+ l3 = l * 3;
+ l4 = l * 4;
+ l5 = l * 5;
+ A = p;
+ B = ws;
+ C = p + l2;
+ D = ws + l2;
+ E = p + l4;
+ W = ws + l4;
+ }
+
+ /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
+ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
+ evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);
+
+ /** Second stage: pointwise multiplies. **/
+ TOOM3_MUL_REC(D, C, C + l, l, W);
+ tD = cD*dD;
+ if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
+ if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
+ ASSERT (tD < 49);
+ TOOM3_MUL_REC(C, B, B + l, l, W);
+ tC = cC*dC;
+ /* TO DO: choose one of the following alternatives. */
+#if 0
+ if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
+ if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
+#else
+ if (cC)
+ {
+ if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
+ else tC += add2Times (C + l, C + l, B + l, l);
+ }
+ if (dC)
+ {
+ if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
+ else tC += add2Times (C + l, C + l, B, l);
+ }
+#endif
+ ASSERT (tC < 9);
+ TOOM3_MUL_REC(B, A, A + l, l, W);
+ tB = cB*dB;
+ if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
+ if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
+ ASSERT (tB < 49);
+ TOOM3_MUL_REC(A, a, b, l, W);
+ TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);
+
+ /** Third stage: interpolation. **/
+ interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
+
+ /** Final stage: add up the coefficients. **/
+ {
+ tB += mpn_add_n (p + l, p + l, B, l2);
+ tD += mpn_add_n (p + l3, p + l3, D, l2);
+ mpn_incr_u (p + l3, tB);
+ mpn_incr_u (p + l4, tC);
+ mpn_incr_u (p + l5, tD);
+ }
+}
+
+/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/
+
+/* Like previous function but for squaring */
+
+#define TOOM3_SQR_REC(p, a, n, ws) \
+ do { \
+ if (n < KARATSUBA_SQR_THRESHOLD) \
+ mpn_sqr_basecase (p, a, n); \
+ else if (n < TOOM3_SQR_THRESHOLD) \
+ mpn_kara_sqr_n (p, a, n, ws); \
+ else \
+ mpn_toom3_sqr_n (p, a, n, ws); \
+ } while (0)
+
+void
+#if __STDC__
+mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
+#else
+mpn_toom3_sqr_n (p, a, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t cB,cC,cD, tB,tC,tD;
+ mp_limb_t *A,*B,*C,*D,*E, *W;
+ mp_size_t l,l2,l3,l4,l5,ls;
+
+ /* Break n words into chunks of size l, l and ls.
+ * n = 3*k => l = k, ls = k
+ * n = 3*k+1 => l = k+1, ls = k-1
+ * n = 3*k+2 => l = k+1, ls = k
+ */
+ {
+ mp_limb_t m;
+
+ ASSERT (n >= TOOM3_MUL_THRESHOLD);
+ l = ls = n / 3;
+ m = n - l * 3;
+ if (m != 0)
+ ++l;
+ if (m == 1)
+ --ls;
+
+ l2 = l * 2;
+ l3 = l * 3;
+ l4 = l * 4;
+ l5 = l * 5;
+ A = p;
+ B = ws;
+ C = p + l2;
+ D = ws + l2;
+ E = p + l4;
+ W = ws + l4;
+ }
+
+ /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
+ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
+
+ /** Second stage: pointwise multiplies. **/
+ TOOM3_SQR_REC(D, C, l, W);
+ tD = cD*cD;
+ if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
+ ASSERT (tD < 49);
+ TOOM3_SQR_REC(C, B, l, W);
+ tC = cC*cC;
+ /* TO DO: choose one of the following alternatives. */
+#if 0
+ if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
+#else
+ if (cC >= 1)
+ {
+ tC += add2Times (C + l, C + l, B, l);
+ if (cC == 2)
+ tC += add2Times (C + l, C + l, B, l);
+ }
+#endif
+ ASSERT (tC < 9);
+ TOOM3_SQR_REC(B, A, l, W);
+ tB = cB*cB;
+ if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
+ ASSERT (tB < 49);
+ TOOM3_SQR_REC(A, a, l, W);
+ TOOM3_SQR_REC(E, a + l2, ls, W);
+
+ /** Third stage: interpolation. **/
+ interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
+
+ /** Final stage: add up the coefficients. **/
+ {
+ tB += mpn_add_n (p + l, p + l, B, l2);
+ tD += mpn_add_n (p + l3, p + l3, D, l2);
+ mpn_incr_u (p + l3, tB);
+ mpn_incr_u (p + l4, tC);
+ mpn_incr_u (p + l5, tD);
+ }
+}
+
+void
+#if __STDC__
+mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
+#else
+mpn_mul_n (p, a, b, n)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_srcptr b;
+ mp_size_t n;
+#endif
+{
+ if (n < KARATSUBA_MUL_THRESHOLD)
+ mpn_mul_basecase (p, a, n, b, n);
+ else if (n < TOOM3_MUL_THRESHOLD)
+ {
+ /* Allocate workspace of fixed size on stack: fast! */
+#if TUNE_PROGRAM_BUILD
+ mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB];
+#else
+ mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB];
+#endif
+ mpn_kara_mul_n (p, a, b, n, ws);
+ }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else if (n < FFT_MUL_THRESHOLD)
+#else
+ else
+#endif
+ {
+ /* Use workspace of unknown size in heap, as stack space may
+ * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the
+ * multiplication will take much longer than malloc()/free(). */
+ mp_limb_t wsLen, *ws;
+ wsLen = 2 * n + 3 * BITS_PER_MP_LIMB;
+ ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t));
+ mpn_toom3_mul_n (p, a, b, n, ws);
+ (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t));
+ }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else
+ {
+ mpn_mul_fft_full (p, a, n, b, n);
+ }
+#endif
+}
diff --git a/rts/gmp/mpn/generic/perfsqr.c b/rts/gmp/mpn/generic/perfsqr.c
new file mode 100644
index 0000000000..42ee3405d7
--- /dev/null
+++ b/rts/gmp/mpn/generic/perfsqr.c
@@ -0,0 +1,123 @@
+/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
+ zero otherwise.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* sq_res_0x100[x mod 0x100] == 1 iff x mod 0x100 is a quadratic residue
+ modulo 0x100. */
+static unsigned char const sq_res_0x100[0x100] =
+{
+ 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+};
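+/* For instance, the squares mod 8 are 0, 1 and 4 only, so 3 cannot be a
+   square mod 0x100 either; accordingly sq_res_0x100[3] == 0 and any
+   operand whose low byte is 3 is rejected by the first test below. */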
+
+int
+#if __STDC__
+mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
+#else
+mpn_perfect_square_p (up, usize)
+ mp_srcptr up;
+ mp_size_t usize;
+#endif
+{
+ mp_limb_t rem;
+ mp_ptr root_ptr;
+ int res;
+ TMP_DECL (marker);
+
+ /* The first test excludes 55/64 (85.9%) of the perfect square candidates
+ in O(1) time. */
+ if ((sq_res_0x100[(unsigned int) up[0] % 0x100] & 1) == 0)
+ return 0;
+
+#if defined (PP)
+ /* The second test excludes 30652543/30808063 (99.5%) of the remaining
+ perfect square candidates in O(n) time. */
+
+ /* Firstly, compute REM = A mod PP. */
+ if (UDIV_TIME > (2 * UMUL_TIME + 6))
+ rem = mpn_preinv_mod_1 (up, usize, (mp_limb_t) PP, (mp_limb_t) PP_INVERTED);
+ else
+ rem = mpn_mod_1 (up, usize, (mp_limb_t) PP);
+
+ /* Now decide if REM is a quadratic residue modulo the factors in PP. */
+
+  /* If A is just a few limbs, computing the square root does not take
+     long, so things might run faster if we limit this loop according to
+     the size of A.  */
+
+#if BITS_PER_MP_LIMB == 64
+ if (((CNST_LIMB(0x12DD703303AED3) >> rem % 53) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x4351B2753DF) >> rem % 47) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x35883A3EE53) >> rem % 43) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x1B382B50737) >> rem % 41) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x165E211E9B) >> rem % 37) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x121D47B7) >> rem % 31) & 1) == 0)
+ return 0;
+#endif
+ if (((0x13D122F3L >> rem % 29) & 1) == 0)
+ return 0;
+ if (((0x5335FL >> rem % 23) & 1) == 0)
+ return 0;
+ if (((0x30AF3L >> rem % 19) & 1) == 0)
+ return 0;
+ if (((0x1A317L >> rem % 17) & 1) == 0)
+ return 0;
+ if (((0x161BL >> rem % 13) & 1) == 0)
+ return 0;
+ if (((0x23BL >> rem % 11) & 1) == 0)
+ return 0;
+ if (((0x017L >> rem % 7) & 1) == 0)
+ return 0;
+ if (((0x13L >> rem % 5) & 1) == 0)
+ return 0;
+ if (((0x3L >> rem % 3) & 1) == 0)
+ return 0;
+#endif
+
+ TMP_MARK (marker);
+
+ /* For the third and last test, we finally compute the square root,
+ to make sure we've really got a perfect square. */
+ root_ptr = (mp_ptr) TMP_ALLOC ((usize + 1) / 2 * BYTES_PER_MP_LIMB);
+
+  /* Iff mpn_sqrtrem returns zero, the remainder is zero and U is a
+     perfect square.  */
+ res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
+ TMP_FREE (marker);
+ return res;
+}
diff --git a/rts/gmp/mpn/generic/popcount.c b/rts/gmp/mpn/generic/popcount.c
new file mode 100644
index 0000000000..387be9536d
--- /dev/null
+++ b/rts/gmp/mpn/generic/popcount.c
@@ -0,0 +1,93 @@
+/* popcount.c
+
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#if defined __GNUC__
+/* No processor claiming to be SPARC v9 compliant seems to
+ implement the POPC instruction. Disable pattern for now. */
+#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64
+#define popc_limb(a) \
+ ({ \
+ DItype __res; \
+ asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \
+ __res; \
+ })
+#endif
+#endif
+
+#ifndef popc_limb
+
+/* Cool population count of a mp_limb_t.
+ You have to figure out how this works, I won't tell you! */
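+/* (All right, one hint: the first step turns each 2-bit field into the
+   count of bits set in it, since 2*b1 + b0 minus b1 is b1 + b0; every
+   later step adds adjacent fields, doubling the field width, until the
+   full count collects in the low byte.) */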
+
+static inline unsigned int
+#if __STDC__
+popc_limb (mp_limb_t x)
+#else
+popc_limb (x)
+ mp_limb_t x;
+#endif
+{
+#if BITS_PER_MP_LIMB == 64
+  /* We have to go to some trouble to define these constants.
+ (For mp_limb_t being `long long'.) */
+ mp_limb_t cnst;
+ cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2);
+ x -= (x & cnst) >> 1;
+ cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2);
+ x = ((x & ~cnst) >> 2) + (x & cnst);
+ cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2);
+ x = ((x >> 4) + x) & cnst;
+ x = ((x >> 8) + x);
+ x = ((x >> 16) + x);
+ x = ((x >> 32) + x) & 0xff;
+#endif
+#if BITS_PER_MP_LIMB == 32
+ x -= (x & 0xaaaaaaaa) >> 1;
+ x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L);
+ x = ((x >> 4) + x) & 0x0f0f0f0fL;
+ x = ((x >> 8) + x);
+ x = ((x >> 16) + x) & 0xff;
+#endif
+ return x;
+}
+#endif
+
+unsigned long int
+#if __STDC__
+mpn_popcount (register mp_srcptr p, register mp_size_t size)
+#else
+mpn_popcount (p, size)
+ register mp_srcptr p;
+ register mp_size_t size;
+#endif
+{
+ unsigned long int popcnt;
+ mp_size_t i;
+
+ popcnt = 0;
+ for (i = 0; i < size; i++)
+ popcnt += popc_limb (p[i]);
+
+ return popcnt;
+}
diff --git a/rts/gmp/mpn/generic/pre_mod_1.c b/rts/gmp/mpn/generic/pre_mod_1.c
new file mode 100644
index 0000000000..27179683b3
--- /dev/null
+++ b/rts/gmp/mpn/generic/pre_mod_1.c
@@ -0,0 +1,69 @@
+/* mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb,
+ divisor_limb_inverted) --
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by the normalized DIVISOR_LIMB.
+   DIVISOR_LIMB_INVERTED should be
+   2^(2*BITS_PER_MP_LIMB) / DIVISOR_LIMB - 2^BITS_PER_MP_LIMB.
+ Return the single-limb remainder.
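+
+   For illustration, with imaginary 8-bit limbs the normalized divisor
+   0xB0 would have the inverse floor(2^16 / 0xB0) - 2^8 = 372 - 256 = 0x74.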
+
+Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+mp_limb_t
+#if __STDC__
+mpn_preinv_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size,
+ mp_limb_t divisor_limb, mp_limb_t divisor_limb_inverted)
+#else
+mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb, divisor_limb_inverted)
+ mp_srcptr dividend_ptr;
+ mp_size_t dividend_size;
+ mp_limb_t divisor_limb;
+ mp_limb_t divisor_limb_inverted;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t n0, r;
+ int dummy;
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ i--;
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (dummy, r, r, n0, divisor_limb, divisor_limb_inverted);
+ }
+ return r;
+}
diff --git a/rts/gmp/mpn/generic/random.c b/rts/gmp/mpn/generic/random.c
new file mode 100644
index 0000000000..dea4e20e56
--- /dev/null
+++ b/rts/gmp/mpn/generic/random.c
@@ -0,0 +1,43 @@
+/* mpn_random -- Generate random numbers.
+
+Copyright (C) 1996, 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "urandom.h"
+
+void
+#if __STDC__
+mpn_random (mp_ptr res_ptr, mp_size_t size)
+#else
+mpn_random (res_ptr, size)
+ mp_ptr res_ptr;
+ mp_size_t size;
+#endif
+{
+ mp_size_t i;
+
+ for (i = 0; i < size; i++)
+ res_ptr[i] = urandom ();
+
+ /* Make sure the most significant limb is non-zero. */
+ while (res_ptr[size - 1] == 0)
+ res_ptr[size - 1] = urandom ();
+}
diff --git a/rts/gmp/mpn/generic/random2.c b/rts/gmp/mpn/generic/random2.c
new file mode 100644
index 0000000000..86682f81fa
--- /dev/null
+++ b/rts/gmp/mpn/generic/random2.c
@@ -0,0 +1,105 @@
+/* mpn_random2 -- Generate random numbers with relatively long strings
+ of ones and zeroes. Suitable for border testing.
+
+Copyright (C) 1992, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#if defined (__hpux) || defined (__alpha) || defined (__svr4__) || defined (__SVR4)
+/* HPUX lacks random(). DEC OSF/1 1.2 random() returns a double. */
+long mrand48 ();
+static inline long
+random ()
+{
+ return mrand48 ();
+}
+#elif defined(_WIN32) && !(defined(__CYGWIN__) || defined(__CYGWIN32__))
+/* MS CRT supplies just the poxy rand(), with an upper bound of 0x7fff */
+static inline unsigned long
+random ()
+{
+  /* rand() supplies only 15 bits, and shifting a 32-bit int by 32 is
+     undefined, so combine three calls at shifts 0, 15 and 30 to cover
+     all 32 bits.  */
+  return rand () ^ (rand () << 15) ^ ((unsigned long) rand () << 30);
+}
+
+#else
+long random ();
+#endif
+
+/* It's a bit tricky to get this right, so please test the code well
+ if you hack with it. Some early versions of the function produced
+ random numbers with the leading limb == 0, and some versions never
+ made the most significant bit set. */
+
+void
+#if __STDC__
+mpn_random2 (mp_ptr res_ptr, mp_size_t size)
+#else
+mpn_random2 (res_ptr, size)
+ mp_ptr res_ptr;
+ mp_size_t size;
+#endif
+{
+ int n_bits;
+ int bit_pos;
+ mp_size_t limb_pos;
+ unsigned int ran;
+ mp_limb_t limb;
+
+ limb = 0;
+
+ /* Start off in a random bit position in the most significant limb. */
+ bit_pos = random () & (BITS_PER_MP_LIMB - 1);
+
+ /* Least significant bit of RAN chooses string of ones/string of zeroes.
+ Make most significant limb be non-zero by setting bit 0 of RAN. */
+ ran = random () | 1;
+
+ for (limb_pos = size - 1; limb_pos >= 0; )
+ {
+ n_bits = (ran >> 1) % BITS_PER_MP_LIMB + 1;
+ if ((ran & 1) != 0)
+ {
+ /* Generate a string of ones. */
+ if (n_bits >= bit_pos)
+ {
+ res_ptr[limb_pos--] = limb | ((((mp_limb_t) 2) << bit_pos) - 1);
+ bit_pos += BITS_PER_MP_LIMB;
+ limb = (~(mp_limb_t) 0) << (bit_pos - n_bits);
+ }
+ else
+ {
+ limb |= ((((mp_limb_t) 1) << n_bits) - 1) << (bit_pos - n_bits + 1);
+ }
+ }
+ else
+ {
+ /* Generate a string of zeroes. */
+ if (n_bits >= bit_pos)
+ {
+ res_ptr[limb_pos--] = limb;
+ limb = 0;
+ bit_pos += BITS_PER_MP_LIMB;
+ }
+ }
+ bit_pos -= n_bits;
+ ran = random ();
+ }
+}
diff --git a/rts/gmp/mpn/generic/rshift.c b/rts/gmp/mpn/generic/rshift.c
new file mode 100644
index 0000000000..59caf73529
--- /dev/null
+++ b/rts/gmp/mpn/generic/rshift.c
@@ -0,0 +1,88 @@
+/* mpn_rshift -- Shift right a low-level natural-number integer.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right
+ and store the USIZE least significant limbs of the result at WP.
+ The bits shifted out to the right are returned.
+
+ Argument constraints:
+ 1. 0 < CNT < BITS_PER_MP_LIMB
+ 2. If the result is to be written over the input, WP must be <= UP.
+*/
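+/* A small worked example, assuming 32-bit limbs: usize = 1, up[0] = 0x13
+   and cnt = 4 yield wp[0] = 0x1 and return value 0x30000000, i.e. the
+   shifted-out low bits 0x3 left-aligned in the returned limb. */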
+
+mp_limb_t
+#if __STDC__
+mpn_rshift (register mp_ptr wp,
+ register mp_srcptr up, mp_size_t usize,
+ register unsigned int cnt)
+#else
+mpn_rshift (wp, up, usize, cnt)
+ register mp_ptr wp;
+ register mp_srcptr up;
+ mp_size_t usize;
+ register unsigned int cnt;
+#endif
+{
+ register mp_limb_t high_limb, low_limb;
+ register unsigned sh_1, sh_2;
+ register mp_size_t i;
+ mp_limb_t retval;
+
+#ifdef DEBUG
+ if (usize == 0 || cnt == 0)
+ abort ();
+#endif
+
+ sh_1 = cnt;
+
+#if 0
+ if (sh_1 == 0)
+ {
+ if (wp != up)
+ {
+ /* Copy from low end to high end, to allow specified input/output
+ overlapping. */
+ for (i = 0; i < usize; i++)
+ wp[i] = up[i];
+ }
+ return usize;
+ }
+#endif
+
+ wp -= 1;
+ sh_2 = BITS_PER_MP_LIMB - sh_1;
+ high_limb = up[0];
+ retval = high_limb << sh_2;
+ low_limb = high_limb;
+
+ for (i = 1; i < usize; i++)
+ {
+ high_limb = up[i];
+ wp[i] = (low_limb >> sh_1) | (high_limb << sh_2);
+ low_limb = high_limb;
+ }
+ wp[i] = low_limb >> sh_1;
+
+ return retval;
+}
diff --git a/rts/gmp/mpn/generic/sb_divrem_mn.c b/rts/gmp/mpn/generic/sb_divrem_mn.c
new file mode 100644
index 0000000000..a269e34f5f
--- /dev/null
+++ b/rts/gmp/mpn/generic/sb_divrem_mn.c
@@ -0,0 +1,201 @@
+/* mpn_sb_divrem_mn -- Divide natural numbers, producing both remainder and
+ quotient.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE
+ INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+ IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A
+ FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1993, 1994, 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write
+   the NSIZE-DSIZE least significant quotient limbs at QP
+   and the DSIZE long remainder at NP.
+   Return the most significant limb of the quotient; this is always 0 or 1.
+
+   Preconditions:
+   0. NSIZE >= DSIZE.
+   1. The most significant bit of the divisor must be set.
+   2. QP must either not overlap with the input operands at all, or
+      QP + DSIZE >= NP must hold true.  (This means that it's
+      possible to put the quotient in the high part of NUM, right after the
+      remainder in NUM.)
+   3. DSIZE > 2, as enforced by the ASSERT_ALWAYS below; smaller divisors
+      are handled by mpn_divrem_1 and mpn_divrem_2.  */
+
+
+#define PREINVERT_VIABLE \
+ (UDIV_TIME > 2 * UMUL_TIME + 6 /* && ! TARGET_REGISTER_STARVED */)
+
+mp_limb_t
+#if __STDC__
+mpn_sb_divrem_mn (mp_ptr qp,
+ mp_ptr np, mp_size_t nsize,
+ mp_srcptr dp, mp_size_t dsize)
+#else
+mpn_sb_divrem_mn (qp, np, nsize, dp, dsize)
+ mp_ptr qp;
+ mp_ptr np;
+ mp_size_t nsize;
+ mp_srcptr dp;
+ mp_size_t dsize;
+#endif
+{
+ mp_limb_t most_significant_q_limb = 0;
+ mp_size_t i;
+ mp_limb_t dx, d1, n0;
+ mp_limb_t dxinv;
+ int have_preinv;
+
+ ASSERT_ALWAYS (dsize > 2);
+
+ np += nsize - dsize;
+ dx = dp[dsize - 1];
+ d1 = dp[dsize - 2];
+ n0 = np[dsize - 1];
+
+ if (n0 >= dx)
+ {
+ if (n0 > dx || mpn_cmp (np, dp, dsize - 1) >= 0)
+ {
+ mpn_sub_n (np, np, dp, dsize);
+ most_significant_q_limb = 1;
+ }
+ }
+
+ /* If multiplication is much faster than division, preinvert the
+ most significant divisor limb before entering the loop. */
+ if (PREINVERT_VIABLE)
+ {
+ have_preinv = 0;
+ if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - dsize) > UDIV_TIME)
+ {
+ invert_limb (dxinv, dx);
+ have_preinv = 1;
+ }
+ }
+
+ for (i = nsize - dsize - 1; i >= 0; i--)
+ {
+ mp_limb_t q;
+ mp_limb_t nx;
+ mp_limb_t cy_limb;
+
+ nx = np[dsize - 1];
+ np--;
+
+ if (nx == dx)
+ {
+ /* This might over-estimate q, but it's probably not worth
+ the extra code here to find out. */
+ q = ~(mp_limb_t) 0;
+
+#if 1
+ cy_limb = mpn_submul_1 (np, dp, dsize, q);
+#else
+ /* This should be faster on many machines */
+ cy_limb = mpn_sub_n (np + 1, np + 1, dp, dsize);
+ cy = mpn_add_n (np, np, dp, dsize);
+ np[dsize] += cy;
+#endif
+
+ if (nx != cy_limb)
+ {
+ mpn_add_n (np, np, dp, dsize);
+ q--;
+ }
+
+ qp[i] = q;
+ }
+ else
+ {
+ mp_limb_t rx, r1, r0, p1, p0;
+
+ /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register
+ usage when np[dsize-1] is used in an asm statement like
+ umul_ppmm in udiv_qrnnd_preinv. The symptom is seg faults due
+ to registers being clobbered. gcc 2.95 i386 doesn't have the
+ problem. */
+ {
+ mp_limb_t workaround = np[dsize - 1];
+ if (PREINVERT_VIABLE && have_preinv)
+ udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv);
+ else
+ udiv_qrnnd (q, r1, nx, workaround, dx);
+ }
+ umul_ppmm (p1, p0, d1, q);
+
+ r0 = np[dsize - 2];
+ rx = 0;
+ if (r1 < p1 || (r1 == p1 && r0 < p0))
+ {
+ p1 -= p0 < d1;
+ p0 -= d1;
+ q--;
+ r1 += dx;
+ rx = r1 < dx;
+ }
+
+ p1 += r0 < p0; /* cannot carry! */
+ rx -= r1 < p1; /* may become 11..1 if q is still too large */
+ r1 -= p1;
+ r0 -= p0;
+
+ cy_limb = mpn_submul_1 (np, dp, dsize - 2, q);
+
+ {
+ mp_limb_t cy1, cy2;
+ cy1 = r0 < cy_limb;
+ r0 -= cy_limb;
+ cy2 = r1 < cy1;
+ r1 -= cy1;
+ np[dsize - 1] = r1;
+ np[dsize - 2] = r0;
+ if (cy2 != rx)
+ {
+ mpn_add_n (np, np, dp, dsize);
+ q--;
+ }
+ }
+ qp[i] = q;
+ }
+ }
+
+ /* ______ ______ ______
+ |__rx__|__r1__|__r0__| partial remainder
+ ______ ______
+ - |__p1__|__p0__| partial product to subtract
+ ______ ______
+ - |______|cylimb|
+
+ rx is -1, 0 or 1. If rx=1, then q is correct (it should match
+ carry out). If rx=-1 then q is too large. If rx=0, then q might
+ be too large, but it is most likely correct.
+ */
+
+ return most_significant_q_limb;
+}
diff --git a/rts/gmp/mpn/generic/scan0.c b/rts/gmp/mpn/generic/scan0.c
new file mode 100644
index 0000000000..96f05ce854
--- /dev/null
+++ b/rts/gmp/mpn/generic/scan0.c
@@ -0,0 +1,62 @@
+/* mpn_scan0 -- Scan from a given bit position for the next clear bit.
+
+Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Design issues:
+ 1. What if starting_bit is not within U? Caller's problem?
+ 2. Bit index should be 'unsigned'?
+
+ Argument constraints:
+   1. U must sooner or later have a limb with a clear bit.
+ */
+
+unsigned long int
+#if __STDC__
+mpn_scan0 (register mp_srcptr up,
+ register unsigned long int starting_bit)
+#else
+mpn_scan0 (up, starting_bit)
+ register mp_srcptr up;
+ register unsigned long int starting_bit;
+#endif
+{
+ mp_size_t starting_word;
+ mp_limb_t alimb;
+ int cnt;
+ mp_srcptr p;
+
+ /* Start at the word implied by STARTING_BIT. */
+ starting_word = starting_bit / BITS_PER_MP_LIMB;
+ p = up + starting_word;
+ alimb = ~*p++;
+
+ /* Mask off any bits before STARTING_BIT in the first limb. */
+ alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB);
+
+ while (alimb == 0)
+ alimb = ~*p++;
+
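+  /* ALIMB & -ALIMB isolates the least significant set bit of ALIMB (a
+     two's complement identity); since ALIMB holds complemented limbs,
+     that bit marks the first clear bit of the operand.
+     count_leading_zeros locates it from the most significant end, and
+     the return expression converts that to an absolute bit index.  */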
+ count_leading_zeros (cnt, alimb & -alimb);
+ return (p - up) * BITS_PER_MP_LIMB - 1 - cnt;
+}
diff --git a/rts/gmp/mpn/generic/scan1.c b/rts/gmp/mpn/generic/scan1.c
new file mode 100644
index 0000000000..98e2e0dcc0
--- /dev/null
+++ b/rts/gmp/mpn/generic/scan1.c
@@ -0,0 +1,62 @@
+/* mpn_scan1 -- Scan from a given bit position for the next set bit.
+
+Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Design issues:
+ 1. What if starting_bit is not within U? Caller's problem?
+ 2. Bit index should be 'unsigned'?
+
+ Argument constraints:
+   1. U must sooner or later have a limb != 0.
+ */
+
+unsigned long int
+#if __STDC__
+mpn_scan1 (register mp_srcptr up,
+ register unsigned long int starting_bit)
+#else
+mpn_scan1 (up, starting_bit)
+ register mp_srcptr up;
+ register unsigned long int starting_bit;
+#endif
+{
+ mp_size_t starting_word;
+ mp_limb_t alimb;
+ int cnt;
+ mp_srcptr p;
+
+ /* Start at the word implied by STARTING_BIT. */
+ starting_word = starting_bit / BITS_PER_MP_LIMB;
+ p = up + starting_word;
+ alimb = *p++;
+
+ /* Mask off any bits before STARTING_BIT in the first limb. */
+ alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB);
+
+ while (alimb == 0)
+ alimb = *p++;
+
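+  /* As in mpn_scan0: ALIMB & -ALIMB isolates the least significant set
+     bit, whose position count_leading_zeros then turns into an absolute
+     bit index.  */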
+ count_leading_zeros (cnt, alimb & -alimb);
+ return (p - up) * BITS_PER_MP_LIMB - 1 - cnt;
+}
diff --git a/rts/gmp/mpn/generic/set_str.c b/rts/gmp/mpn/generic/set_str.c
new file mode 100644
index 0000000000..e6ccc92154
--- /dev/null
+++ b/rts/gmp/mpn/generic/set_str.c
@@ -0,0 +1,159 @@
+/* mpn_set_str (mp_ptr res_ptr, const char *str, size_t str_len, int base)
+ -- Convert a STR_LEN long base BASE byte string pointed to by STR to a
+ limb vector pointed to by RES_PTR. Return the number of limbs in
+ RES_PTR.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_size_t
+#if __STDC__
+mpn_set_str (mp_ptr xp, const unsigned char *str, size_t str_len, int base)
+#else
+mpn_set_str (xp, str, str_len, base)
+ mp_ptr xp;
+ const unsigned char *str;
+ size_t str_len;
+ int base;
+#endif
+{
+ mp_size_t size;
+ mp_limb_t big_base;
+ int indigits_per_limb;
+ mp_limb_t res_digit;
+
+ big_base = __mp_bases[base].big_base;
+ indigits_per_limb = __mp_bases[base].chars_per_limb;
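+
+  /* For example, with 32-bit limbs and base 10, chars_per_limb is 9 and
+     big_base is 10**9: nine input digits at a time are accumulated into
+     RES_DIGIT below, which is then folded into the result with one
+     mpn_mul_1/mpn_add_1 pass per group.  For power-of-2 bases, big_base
+     instead holds the number of bits per digit.  */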
+
+/* size = str_len / indigits_per_limb + 1; */
+
+ size = 0;
+
+ if ((base & (base - 1)) == 0)
+ {
+ /* The base is a power of 2. Read the input string from
+ least to most significant character/digit. */
+
+ const unsigned char *s;
+ int next_bitpos;
+ int bits_per_indigit = big_base;
+
+ res_digit = 0;
+ next_bitpos = 0;
+
+ for (s = str + str_len - 1; s >= str; s--)
+ {
+ int inp_digit = *s;
+
+ res_digit |= (mp_limb_t) inp_digit << next_bitpos;
+ next_bitpos += bits_per_indigit;
+ if (next_bitpos >= BITS_PER_MP_LIMB)
+ {
+ xp[size++] = res_digit;
+ next_bitpos -= BITS_PER_MP_LIMB;
+ res_digit = inp_digit >> (bits_per_indigit - next_bitpos);
+ }
+ }
+
+ if (res_digit != 0)
+ xp[size++] = res_digit;
+ }
+ else
+ {
+ /* General case. The base is not a power of 2. */
+
+ size_t i;
+ int j;
+ mp_limb_t cy_limb;
+
+ for (i = indigits_per_limb; i < str_len; i += indigits_per_limb)
+ {
+ res_digit = *str++;
+ if (base == 10)
+ { /* This is a common case.
+ Help the compiler to avoid multiplication. */
+ for (j = 1; j < indigits_per_limb; j++)
+ res_digit = res_digit * 10 + *str++;
+ }
+ else
+ {
+ for (j = 1; j < indigits_per_limb; j++)
+ res_digit = res_digit * base + *str++;
+ }
+
+ if (size == 0)
+ {
+ if (res_digit != 0)
+ {
+ xp[0] = res_digit;
+ size = 1;
+ }
+ }
+ else
+ {
+ cy_limb = mpn_mul_1 (xp, xp, size, big_base);
+ cy_limb += mpn_add_1 (xp, xp, size, res_digit);
+ if (cy_limb != 0)
+ xp[size++] = cy_limb;
+ }
+ }
+
+ big_base = base;
+ res_digit = *str++;
+ if (base == 10)
+ { /* This is a common case.
+ Help the compiler to avoid multiplication. */
+ for (j = 1; j < str_len - (i - indigits_per_limb); j++)
+ {
+ res_digit = res_digit * 10 + *str++;
+ big_base *= 10;
+ }
+ }
+ else
+ {
+ for (j = 1; j < str_len - (i - indigits_per_limb); j++)
+ {
+ res_digit = res_digit * base + *str++;
+ big_base *= base;
+ }
+ }
+
+ if (size == 0)
+ {
+ if (res_digit != 0)
+ {
+ xp[0] = res_digit;
+ size = 1;
+ }
+ }
+ else
+ {
+ cy_limb = mpn_mul_1 (xp, xp, size, big_base);
+ cy_limb += mpn_add_1 (xp, xp, size, res_digit);
+ if (cy_limb != 0)
+ xp[size++] = cy_limb;
+ }
+ }
+
+ return size;
+}
diff --git a/rts/gmp/mpn/generic/sqr_basecase.c b/rts/gmp/mpn/generic/sqr_basecase.c
new file mode 100644
index 0000000000..760258a3e0
--- /dev/null
+++ b/rts/gmp/mpn/generic/sqr_basecase.c
@@ -0,0 +1,83 @@
+/* mpn_sqr_basecase -- Internal routine to square a natural number
+   of length n.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
+ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
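+/* The scheme below: the diagonal products up[i]^2 are written directly to
+   PRODP, while the off-diagonal products up[i]*up[j] (i < j) are
+   accumulated in the temporary TP, doubled with a single mpn_lshift, and
+   added in.  Compared with mpn_mul_basecase this roughly halves the
+   number of limb multiplications.  */
+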
+void
+#if __STDC__
+mpn_sqr_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t n)
+#else
+mpn_sqr_basecase (prodp, up, n)
+ mp_ptr prodp;
+ mp_srcptr up;
+ mp_size_t n;
+#endif
+{
+ mp_size_t i;
+
+ {
+ /* N.B.! We need the superfluous indirection through argh to work around
+ a reloader bug in GCC 2.7.*. */
+ mp_limb_t x;
+ mp_limb_t argh;
+ x = up[0];
+ umul_ppmm (argh, prodp[0], x, x);
+ prodp[1] = argh;
+ }
+ if (n > 1)
+ {
+ mp_limb_t tarr[2 * KARATSUBA_SQR_THRESHOLD];
+ mp_ptr tp = tarr;
+ mp_limb_t cy;
+
+ /* must fit 2*n limbs in tarr */
+ ASSERT (n <= KARATSUBA_SQR_THRESHOLD);
+
+ cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
+ tp[n - 1] = cy;
+ for (i = 2; i < n; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
+ tp[n + i - 2] = cy;
+ }
+ for (i = 1; i < n; i++)
+ {
+ mp_limb_t x;
+ x = up[i];
+ umul_ppmm (prodp[2 * i + 1], prodp[2 * i], x, x);
+ }
+ {
+ mp_limb_t cy;
+ cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
+ cy += mpn_add_n (prodp + 1, prodp + 1, tp, 2 * n - 2);
+ prodp[2 * n - 1] += cy;
+ }
+ }
+}
diff --git a/rts/gmp/mpn/generic/sqrtrem.c b/rts/gmp/mpn/generic/sqrtrem.c
new file mode 100644
index 0000000000..ee3b5144dd
--- /dev/null
+++ b/rts/gmp/mpn/generic/sqrtrem.c
@@ -0,0 +1,509 @@
+/* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
+
+ Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.
+ Write the remainder at REM_PTR, if REM_PTR != NULL.
+ Return the size of the remainder.
+   (The size of the root is always half the size of the operand, rounded up.)
+
+ OP_PTR and ROOT_PTR may not point to the same object.
+ OP_PTR and REM_PTR may point to the same object.
+
+ If REM_PTR is NULL, only the root is computed and the return value of
+ the function is 0 if OP is a perfect square, and *any* non-zero number
+ otherwise.
+
+Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* This code is only correct if "unsigned char" has at least 8 bits.  It
+   doesn't help to use CHAR_BIT from limits.h, as the real problem is
+   the static arrays.  */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Square root algorithm:
+
+   1. Shift OP (the input) to the left an even number of bits such that
+      there is an even number of words and at least one of the two most
+      significant bits is set.  This way, sqrt(OP) has exactly half as
+      many words as OP, and has its most significant bit set.
+
+ 2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.
+ This approximation is used for the first single-precision
+ iterations of Newton's method, yielding a full-word approximation
+ to sqrt(OP).
+
+ 3. Perform multiple-precision Newton iteration until we have the
+ exact result. Only about half of the input operand is used in
+ this calculation, as the square root is perfectly determinable
+ from just the higher half of a number. */
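+
+/* A worked instance of one Newton step r' = (r + t/r) / 2 (integer
+   division): for t = 1000000 and the approximation r = 800, t/r = 1250
+   and r' = (800 + 1250)/2 = 1025; the next step gives (1025 + 975)/2
+   = 1000, the exact root.  Each step roughly doubles the number of
+   correct bits, which is why each multiple-precision iteration below can
+   run at a higher precision than the one before (see the TSIZES array).  */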
+
+/* Define this macro for IEEE P854 machines with a fast sqrt instruction. */
+#if defined __GNUC__ && ! defined __SOFT_FLOAT
+
+#if defined (__sparc__) && BITS_PER_MP_LIMB == 32
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if defined (__HAVE_68881__)
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if defined (__hppa) && BITS_PER_MP_LIMB == 32
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if 0
+#if defined (__i386__) || defined (__i486__)
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a)); \
+ __sqrt_res; \
+ })
+#endif
+#endif
+
+#endif
+
+#ifndef SQRT
+
+/* Tables for initial approximation of the square root.  These are
+   indexed with bits 1-8 of the operand for which the square root is
+   calculated, where bit 0 is the most significant non-zero bit.  I.e.
+   the most significant one-bit is not used, since that by definition
+   is one.  Likewise, the tables don't return the highest bit of the
+   result.  That bit must be inserted by OR-ing the returned value with
+   0x100.  This way, we get a 9-bit approximation from 8-bit tables!  */
+
+/* Table to be used for operands with an even total number of bits.
+   (Just as in the decimal system, the square roots of numbers with the
+   same initial digits are similar whenever their total digit counts
+   differ by an even amount.  Consider the square roots of 1, 100,
+   10000, ...)  */
+static const unsigned char even_approx_tab[256] =
+{
+ 0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,
+ 0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,
+ 0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,
+ 0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,
+ 0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,
+ 0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,
+ 0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,
+ 0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,
+ 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,
+ 0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,
+ 0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,
+ 0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,
+ 0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,
+ 0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,
+ 0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,
+ 0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,
+ 0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,
+ 0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,
+ 0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,
+ 0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,
+ 0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,
+ 0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,
+ 0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,
+ 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,
+ 0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,
+ 0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,
+ 0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,
+ 0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,
+ 0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,
+ 0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,
+ 0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,
+};
+
+/* Table to be used for operands with an odd total number of bits.
+ (Further comments before previous table.) */
+static const unsigned char odd_approx_tab[256] =
+{
+ 0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
+ 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,
+ 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,
+ 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,
+ 0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,
+ 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,
+ 0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,
+ 0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,
+ 0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,
+ 0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,
+ 0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
+ 0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,
+ 0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,
+ 0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,
+ 0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,
+ 0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,
+ 0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,
+ 0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,
+ 0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,
+ 0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,
+ 0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,
+ 0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,
+ 0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,
+ 0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,
+ 0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,
+ 0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,
+ 0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,
+ 0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,
+ 0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,
+ 0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,
+ 0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,
+ 0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,
+};
+#endif
+
+
+mp_size_t
+#if __STDC__
+mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size)
+#else
+mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
+ mp_ptr root_ptr;
+ mp_ptr rem_ptr;
+ mp_srcptr op_ptr;
+ mp_size_t op_size;
+#endif
+{
+ /* R (root result) */
+ mp_ptr rp; /* Pointer to least significant word */
+ mp_size_t rsize; /* The size in words */
+
+ /* T (OP shifted to the left a.k.a. normalized) */
+ mp_ptr tp; /* Pointer to least significant word */
+ mp_size_t tsize; /* The size in words */
+ mp_ptr t_end_ptr; /* Pointer right beyond most sign. word */
+ mp_limb_t t_high0, t_high1; /* The two most significant words */
+
+ /* TT (temporary for numerator/remainder) */
+ mp_ptr ttp; /* Pointer to least significant word */
+
+ /* X (temporary for quotient in main loop) */
+ mp_ptr xp; /* Pointer to least significant word */
+ mp_size_t xsize; /* The size in words */
+
+ unsigned cnt;
+ mp_limb_t initial_approx; /* Initially made approximation */
+ mp_size_t tsizes[BITS_PER_MP_LIMB]; /* Successive calculation precisions */
+ mp_size_t tmp;
+ mp_size_t i;
+
+ mp_limb_t cy_limb;
+ TMP_DECL (marker);
+
+ /* If OP is zero, both results are zero. */
+ if (op_size == 0)
+ return 0;
+
+ count_leading_zeros (cnt, op_ptr[op_size - 1]);
+ tsize = op_size;
+ if ((tsize & 1) != 0)
+ {
+ cnt += BITS_PER_MP_LIMB;
+ tsize++;
+ }
+
+ rsize = tsize / 2;
+ rp = root_ptr;
+
+ TMP_MARK (marker);
+
+ /* Shift OP an even number of bits into T, such that either the most or
+ the second most significant bit is set, and such that the number of
+ words in T becomes even. This way, the number of words in R=sqrt(OP)
+ is exactly half as many as in OP, and the most significant bit of R
+ is set.
+
+ Also, the initial approximation is simplified by this up-shifted OP.
+
+ Finally, the Newtonian iteration which is the main part of this
+ program performs division by R. The fast division routine expects
+ the divisor to be "normalized" in exactly the sense of having the
+ most significant bit set. */
+
+ tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+
+ if ((cnt & ~1) % BITS_PER_MP_LIMB != 0)
+ t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,
+ (cnt & ~1) % BITS_PER_MP_LIMB);
+ else
+ MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size);
+
+ if (cnt >= BITS_PER_MP_LIMB)
+ tp[0] = 0;
+
+ t_high0 = tp[tsize - 1];
+ t_high1 = tp[tsize - 2]; /* Never stray. TSIZE is >= 2. */
+
+/* Is there a fast sqrt instruction defined for this machine? */
+#ifdef SQRT
+ {
+ initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1);
+ /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have
+ become incorrect due to overflow in the conversion from double to
+ mp_limb_t above. It will typically be zero in that case, but might be
+ a small number on some machines. The most significant bit of
+ INITIAL_APPROX should be set, so that bit is a good overflow
+ indication. */
+ if ((mp_limb_signed_t) initial_approx >= 0)
+ initial_approx = ~(mp_limb_t)0;
+ }
+#else
+  /* Get a 9-bit approximation from the tables.  The tables expect to
+     be indexed with the 8 high bits right below the highest bit.
+     Also, the highest result bit is not returned by the tables, and
+     must be OR-ed into the result.  The scheme gives 9 bits of start
+     approximation with just 256-entry 8-bit tables.  */
+
+ if ((cnt & 1) == 0)
+ {
+ /* The most significant bit of t_high0 is set. */
+ initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);
+ initial_approx &= 0xff;
+ initial_approx = even_approx_tab[initial_approx];
+ }
+ else
+ {
+ /* The most significant bit of t_high0 is unset,
+ the second most significant is set. */
+ initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);
+ initial_approx &= 0xff;
+ initial_approx = odd_approx_tab[initial_approx];
+ }
+ initial_approx |= 0x100;
+ initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;
+
+ /* Perform small precision Newtonian iterations to get a full word
+ approximation. For small operands, these iterations will do the
+ entire job. */
+ if (t_high0 == ~(mp_limb_t)0)
+ initial_approx = t_high0;
+ else
+ {
+ mp_limb_t quot;
+
+ if (t_high0 >= initial_approx)
+ initial_approx = t_high0 + 1;
+
+      /* First get about 18 bits with pure C arithmetic. */
+ quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;
+ initial_approx = (initial_approx + quot) / 2;
+ initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+
+      /* Now get a full word by one iteration (or, on machines with limbs
+         wider than 36 bits, several).  */
+ for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1)
+ {
+ mp_limb_t ignored_remainder;
+
+ udiv_qrnnd (quot, ignored_remainder,
+ t_high0, t_high1, initial_approx);
+ initial_approx = (initial_approx + quot) / 2;
+ initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+ }
+ }
+#endif
+
+ rp[0] = initial_approx;
+ rsize = 1;
+
+#ifdef SQRT_DEBUG
+ printf ("\n\nT = ");
+ mpn_dump (tp, tsize);
+#endif
+
+ if (tsize > 2)
+ {
+      /* Determine the successive precisions to use in the iteration.  We
+         minimize the precisions, working from the highest (i.e. last
+         iteration) down to the lowest (i.e. first iteration).  */
+
+ xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+ ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+
+ t_end_ptr = tp + tsize;
+
+ tmp = tsize / 2;
+ for (i = 0;; i++)
+ {
+ tsize = (tmp + 1) / 2;
+ if (tmp == tsize)
+ break;
+ tsizes[i] = tsize + tmp;
+ tmp = tsize;
+ }
+
+ /* Main Newton iteration loop. For big arguments, most of the
+ time is spent here. */
+
+    /* A substantial optimization is possible here.  Each successive
+       divisor in the mpn_divmod call below shares more and more leading
+       words with its predecessor, so the beginning of each division
+       repeats work already done by the previous one.  If we could
+       guarantee that the leading words of two consecutive divisors were
+       identical (i.e. that a later divisor merely has more limbs at the
+       low end), simply reusing the old remainder of the last division
+       in the subsequent division would realize this optimization.  It
+       would surely make a difference even for small arguments.  */
+
+ /* Loop invariants:
+
+ R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
+ X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
+ R <= shiftdown_to_same_size(X). */
+
+ while (--i >= 0)
+ {
+ mp_limb_t cy;
+#ifdef SQRT_DEBUG
+ mp_limb_t old_least_sign_r = rp[0];
+ mp_size_t old_rsize = rsize;
+
+ printf ("R = ");
+ mpn_dump (rp, rsize);
+#endif
+ tsize = tsizes[i];
+
+ /* Need to copy the numerator into temporary space, as
+ mpn_divmod overwrites its numerator argument with the
+ remainder (which we currently ignore). */
+ MPN_COPY (ttp, t_end_ptr - tsize, tsize);
+ cy = mpn_divmod (xp, ttp, tsize, rp, rsize);
+ xsize = tsize - rsize;
+
+#ifdef SQRT_DEBUG
+ printf ("X =%d ", cy);
+ mpn_dump (xp, xsize);
+#endif
+
+ /* Add X and R with the most significant limbs aligned,
+ temporarily ignoring at least one limb at the low end of X. */
+ tmp = xsize - rsize;
+ cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize);
+
+ /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
+ intermediate roots that'd need an extra bit. We don't want to
+ handle that since it would make the subsequent divisor
+ non-normalized, so round such roots down to be only ones in the
+ current precision. */
+ if (cy == 2)
+ {
+ mp_size_t j;
+ for (j = xsize; j >= 0; j--)
+ xp[j] = ~(mp_limb_t)0;
+ }
+
+ /* Divide X by 2 and put the result in R. This is the new
+ approximation. Shift in the carry from the addition. */
+ mpn_rshift (rp, xp, xsize, 1);
+ rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
+ rsize = xsize;
+#ifdef SQRT_DEBUG
+ if (old_least_sign_r != rp[rsize - old_rsize])
+ printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n",
+ i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r,
+ 2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]);
+#endif
+ }
+ }
+
+#ifdef SQRT_DEBUG
+ printf ("(final) R = ");
+ mpn_dump (rp, rsize);
+#endif
+
+ /* We computed the square root of OP * 2**(2*floor(cnt/2)).
+     This has resulted in R being 2**floor(cnt/2) too large.
+ Shift it down here to fix that. */
+ if (cnt / 2 != 0)
+ {
+ mpn_rshift (rp, rp, rsize, cnt/2);
+ rsize -= rp[rsize - 1] == 0;
+ }
+
+ /* Calculate the remainder. */
+ mpn_mul_n (tp, rp, rp, rsize);
+ tsize = rsize + rsize;
+ tsize -= tp[tsize - 1] == 0;
+ if (op_size < tsize
+ || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
+ {
+ /* R is too large. Decrement it. */
+
+ /* These operations can't overflow. */
+ cy_limb = mpn_sub_n (tp, tp, rp, rsize);
+ cy_limb += mpn_sub_n (tp, tp, rp, rsize);
+ mpn_decr_u (tp + rsize, cy_limb);
+ mpn_incr_u (tp, (mp_limb_t) 1);
+
+ mpn_decr_u (rp, (mp_limb_t) 1);
+
+#ifdef SQRT_DEBUG
+ printf ("(adjusted) R = ");
+ mpn_dump (rp, rsize);
+#endif
+ }
+
+ if (rem_ptr != NULL)
+ {
+ cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);
+ MPN_NORMALIZE (rem_ptr, op_size);
+ TMP_FREE (marker);
+ return op_size;
+ }
+ else
+ {
+ int res;
+ res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size);
+ TMP_FREE (marker);
+ return res;
+ }
+}
diff --git a/rts/gmp/mpn/generic/sub_n.c b/rts/gmp/mpn/generic/sub_n.c
new file mode 100644
index 0000000000..4f2f06099c
--- /dev/null
+++ b/rts/gmp/mpn/generic/sub_n.c
@@ -0,0 +1,62 @@
+/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+#if __STDC__
+mpn_sub_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
+#else
+mpn_sub_n (res_ptr, s1_ptr, s2_ptr, size)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ register mp_srcptr s2_ptr;
+ mp_size_t size;
+#endif
+{
+ register mp_limb_t x, y, cy;
+ register mp_size_t j;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ s2_ptr -= j;
+ res_ptr -= j;
+
+ cy = 0;
+ do
+ {
+ y = s2_ptr[j];
+ x = s1_ptr[j];
+ y += cy; /* add previous carry to subtrahend */
+ cy = (y < cy); /* get out carry from that addition */
+ y = x - y; /* main subtract */
+ cy = (y > x) + cy; /* get out carry from the subtract, combine */
+ res_ptr[j] = y;
+ }
+ while (++j != 0);
+
+ return cy;
+}
diff --git a/rts/gmp/mpn/generic/submul_1.c b/rts/gmp/mpn/generic/submul_1.c
new file mode 100644
index 0000000000..c7c08ee4af
--- /dev/null
+++ b/rts/gmp/mpn/generic/submul_1.c
@@ -0,0 +1,65 @@
+/* mpn_submul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
+ by S2_LIMB, subtract the S1_SIZE least significant limbs of the product
+ from the limb vector pointed to by RES_PTR. Return the most significant
+ limb of the product, adjusted for carry-out from the subtraction.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_submul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ mp_size_t s1_size;
+ register mp_limb_t s2_limb;
+{
+ register mp_limb_t cy_limb;
+ register mp_size_t j;
+ register mp_limb_t prod_high, prod_low;
+ register mp_limb_t x;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -s1_size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ res_ptr -= j;
+ s1_ptr -= j;
+
+ cy_limb = 0;
+ do
+ {
+ umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
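+      /* Fold the previous carry into the low product word.  (prod_low <
+         cy_limb) detects wraparound; adding it to prod_high cannot
+         overflow, since the high limb of a full product is at most
+         2**BITS_PER_MP_LIMB - 2.  */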
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb) + prod_high;
+
+ x = res_ptr[j];
+ prod_low = x - prod_low;
+ cy_limb += (prod_low > x);
+ res_ptr[j] = prod_low;
+ }
+ while (++j != 0);
+
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/tdiv_qr.c b/rts/gmp/mpn/generic/tdiv_qr.c
new file mode 100644
index 0000000000..b748b5d810
--- /dev/null
+++ b/rts/gmp/mpn/generic/tdiv_qr.c
@@ -0,0 +1,401 @@
+/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and
+ write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp. If
+ qxn is non-zero, generate that many fraction limbs and append them after the
+   other quotient limbs, and update the remainder accordingly.  The input
+ operands are unaffected.
+
+ Preconditions:
+  1. The most significant limb of the divisor must be non-zero.
+ 2. No argument overlap is permitted. (??? relax this ???)
+ 3. nn >= dn, even if qxn is non-zero. (??? relax this ???)
+
+ The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time
+ complexity of multiplication.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
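+/* Illustrative only (buffer names invented): dividing a 5-limb numerator
+   by a 2-limb denominator yields nn-dn+1 = 4 quotient limbs and dn = 2
+   remainder limbs, with qxn required to be 0 for now:
+
+       mp_limb_t qp[4], rp[2];
+       mpn_tdiv_qr (qp, rp, 0L, np, 5, dp, 2);  */
+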
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD)
+#endif
+
+/* Extract the middle limb from ((h,,l) << cnt) */
+#define SHL(h,l,cnt) \
+ ((h << cnt) | ((l >> 1) >> ((~cnt) & (BITS_PER_MP_LIMB - 1))))
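+/* The two-step shift of L avoids shifting a limb by a full
+   BITS_PER_MP_LIMB (undefined in C) when CNT is 0: the L contribution is
+   l >> (BITS_PER_MP_LIMB - cnt) for 0 < cnt, and 0 for cnt == 0.  */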
+
+void
+#if __STDC__
+mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
+ mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
+#else
+mpn_tdiv_qr (qp, rp, qxn, np, nn, dp, dn)
+ mp_ptr qp;
+ mp_ptr rp;
+ mp_size_t qxn;
+ mp_srcptr np;
+ mp_size_t nn;
+ mp_srcptr dp;
+ mp_size_t dn;
+#endif
+{
+ /* FIXME:
+ 1. qxn
+ 2. pass allocated storage in additional parameter?
+ */
+ if (qxn != 0)
+ abort ();
+
+ switch (dn)
+ {
+ case 0:
+ DIVIDE_BY_ZERO;
+
+ case 1:
+ {
+ rp[0] = mpn_divmod_1 (qp, np, nn, dp[0]);
+ return;
+ }
+
+ case 2:
+ {
+ int cnt;
+ mp_ptr n2p, d2p;
+ mp_limb_t qhl, cy;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ count_leading_zeros (cnt, dp[dn - 1]);
+ if (cnt != 0)
+ {
+ d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_lshift (d2p, dp, dn, cnt);
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
+ cy = mpn_lshift (n2p, np, nn, cnt);
+ n2p[nn] = cy;
+ qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p);
+ if (cy == 0)
+ qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */
+ }
+ else
+ {
+ d2p = (mp_ptr) dp;
+ n2p = (mp_ptr) TMP_ALLOC (nn * BYTES_PER_MP_LIMB);
+ MPN_COPY (n2p, np, nn);
+ qhl = mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
+ qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */
+ }
+
+ if (cnt != 0)
+ mpn_rshift (rp, n2p, dn, cnt);
+ else
+ MPN_COPY (rp, n2p, dn);
+ TMP_FREE (marker);
+ return;
+ }
+
+ default:
+ {
+ int adjust;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ adjust = np[nn - 1] >= dp[dn - 1]; /* conservative tests for quotient size */
+ if (nn + adjust >= 2 * dn)
+ {
+ mp_ptr n2p, d2p;
+ mp_limb_t cy;
+ int cnt;
+ count_leading_zeros (cnt, dp[dn - 1]);
+
+ qp[nn - dn] = 0; /* zero high quotient limb */
+ if (cnt != 0) /* normalize divisor if needed */
+ {
+ d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_lshift (d2p, dp, dn, cnt);
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
+ cy = mpn_lshift (n2p, np, nn, cnt);
+ n2p[nn] = cy;
+ nn += adjust;
+ }
+ else
+ {
+ d2p = (mp_ptr) dp;
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (n2p, np, nn);
+ n2p[nn] = 0;
+ nn += adjust;
+ }
+
+ if (dn == 2)
+ mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
+ else if (dn < BZ_THRESHOLD)
+ mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);
+ else
+ {
+ /* Perform 2*dn / dn limb divisions as long as the limbs
+ in np last. */
+ mp_ptr q2p = qp + nn - 2 * dn;
+ n2p += nn - 2 * dn;
+ mpn_bz_divrem_n (q2p, n2p, d2p, dn);
+ nn -= dn;
+ while (nn >= 2 * dn)
+ {
+ mp_limb_t c;
+ q2p -= dn; n2p -= dn;
+ c = mpn_bz_divrem_n (q2p, n2p, d2p, dn);
+ ASSERT_ALWAYS (c == 0);
+ nn -= dn;
+ }
+
+ if (nn != dn)
+ {
+ n2p -= nn - dn;
+ /* In theory, we could fall out to the cute code below
+ since we now have exactly the situation that code
+ is designed to handle. We botch this badly and call
+ the basic mpn_sb_divrem_mn! */
+ if (dn == 2)
+ mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
+ else
+ mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);
+ }
+ }
+
+
+ if (cnt != 0)
+ mpn_rshift (rp, n2p, dn, cnt);
+ else
+ MPN_COPY (rp, n2p, dn);
+ TMP_FREE (marker);
+ return;
+ }
+
+ /* When we come here, the numerator/partial remainder is less
+ than twice the size of the denominator. */
+
+ {
+ /* Problem:
+
+ Divide a numerator N with nn limbs by a denominator D with dn
+ limbs forming a quotient of nn-dn+1 limbs. When qn is small
+ compared to dn, conventional division algorithms perform poorly.
+ We want an algorithm that has an expected running time that is
+ dependent only on qn. It is assumed that the most significant
+ limb of the numerator is smaller than the most significant limb
+ of the denominator.
+
+ Algorithm (very informally stated):
+
+ 1) Divide the 2 x qn most significant limbs from the numerator
+ by the qn most significant limbs from the denominator. Call
+ the result qest. This is either the correct quotient, but
+ might be 1 or 2 too large. Compute the remainder from the
+ division. (This step is implemented by a mpn_divrem call.)
+
+ 2) Is the most significant limb from the remainder < p, where p
+ is the product of the most significant limb from the quotient
+ and the next(d). (Next(d) denotes the next ignored limb from
+ the denominator.) If it is, decrement qest, and adjust the
+ remainder accordingly.
+
+ 3) Is the remainder >= qest? If it is, qest is the desired
+ quotient. The algorithm terminates.
+
+ 4) Subtract qest x next(d) from the remainder. If there is
+ borrow out, decrement qest, and adjust the remainder
+ accordingly.
+
+ 5) Skip one word from the denominator (i.e., let next(d) denote
+	    the next less significant limb).  */
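+
+	 /* Note that the estimate from step 1 is at most 2 too large, so
+	    qest is decremented at most twice; the quotient_too_large fixup
+	    below performs the last correction.  */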
+
+ mp_size_t qn;
+ mp_ptr n2p, d2p;
+ mp_ptr tp;
+ mp_limb_t cy;
+ mp_size_t in, rn;
+ mp_limb_t quotient_too_large;
+ int cnt;
+
+ qn = nn - dn;
+ qp[qn] = 0; /* zero high quotient limb */
+ qn += adjust; /* qn cannot become bigger */
+
+ if (qn == 0)
+ {
+ MPN_COPY (rp, np, dn);
+ TMP_FREE (marker);
+ return;
+ }
+
+ in = dn - qn; /* (at least partially) ignored # of limbs in ops */
+ /* Normalize denominator by shifting it to the left such that its
+ most significant bit is set. Then shift the numerator the same
+ amount, to mathematically preserve quotient. */
+ count_leading_zeros (cnt, dp[dn - 1]);
+ if (cnt != 0)
+ {
+ d2p = (mp_ptr) TMP_ALLOC (qn * BYTES_PER_MP_LIMB);
+
+ mpn_lshift (d2p, dp + in, qn, cnt);
+ d2p[0] |= dp[in - 1] >> (BITS_PER_MP_LIMB - cnt);
+
+ n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
+ cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt);
+ if (adjust)
+ {
+ n2p[2 * qn] = cy;
+ n2p++;
+ }
+ else
+ {
+ n2p[0] |= np[nn - 2 * qn - 1] >> (BITS_PER_MP_LIMB - cnt);
+ }
+ }
+ else
+ {
+ d2p = (mp_ptr) dp + in;
+
+ n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn);
+ if (adjust)
+ {
+ n2p[2 * qn] = 0;
+ n2p++;
+ }
+ }
+
+ /* Get an approximate quotient using the extracted operands. */
+ if (qn == 1)
+ {
+ mp_limb_t q0, r0;
+ mp_limb_t gcc272bug_n1, gcc272bug_n0, gcc272bug_d0;
+ /* Due to a gcc 2.7.2.3 reload pass bug, we have to use some
+ temps here. This doesn't hurt code quality on any machines
+ so we do it unconditionally. */
+ gcc272bug_n1 = n2p[1];
+ gcc272bug_n0 = n2p[0];
+ gcc272bug_d0 = d2p[0];
+ udiv_qrnnd (q0, r0, gcc272bug_n1, gcc272bug_n0, gcc272bug_d0);
+ n2p[0] = r0;
+ qp[0] = q0;
+ }
+ else if (qn == 2)
+ mpn_divrem_2 (qp, 0L, n2p, 4L, d2p);
+ else if (qn < BZ_THRESHOLD)
+ mpn_sb_divrem_mn (qp, n2p, qn * 2, d2p, qn);
+ else
+ mpn_bz_divrem_n (qp, n2p, d2p, qn);
+
+ rn = qn;
+ /* Multiply the first ignored divisor limb by the most significant
+ quotient limb. If that product is > the partial remainder's
+ most significant limb, we know the quotient is too large. This
+ test quickly catches most cases where the quotient is too large;
+ it catches all cases where the quotient is 2 too large. */
+ {
+ mp_limb_t dl, x;
+ mp_limb_t h, l;
+
+ if (in - 2 < 0)
+ dl = 0;
+ else
+ dl = dp[in - 2];
+
+ x = SHL (dp[in - 1], dl, cnt);
+ umul_ppmm (h, l, x, qp[qn - 1]);
+
+ if (n2p[qn - 1] < h)
+ {
+ mp_limb_t cy;
+
+ mpn_decr_u (qp, (mp_limb_t) 1);
+ cy = mpn_add_n (n2p, n2p, d2p, qn);
+ if (cy)
+ {
+ /* The partial remainder is safely large. */
+ n2p[qn] = cy;
+ ++rn;
+ }
+ }
+ }
+
+ quotient_too_large = 0;
+ if (cnt != 0)
+ {
+ mp_limb_t cy1, cy2;
+
+ /* Append partially used numerator limb to partial remainder. */
+ cy1 = mpn_lshift (n2p, n2p, rn, BITS_PER_MP_LIMB - cnt);
+ n2p[0] |= np[in - 1] & (~(mp_limb_t) 0 >> cnt);
+
+ /* Update partial remainder with partially used divisor limb. */
+ cy2 = mpn_submul_1 (n2p, qp, qn, dp[in - 1] & (~(mp_limb_t) 0 >> cnt));
+ if (qn != rn)
+ {
+ if (n2p[qn] < cy2)
+ abort ();
+ n2p[qn] -= cy2;
+ }
+ else
+ {
+ n2p[qn] = cy1 - cy2;
+
+ quotient_too_large = (cy1 < cy2);
+ ++rn;
+ }
+ --in;
+ }
+	/* The partial remainder is now "neutral", i.e., it is no longer shifted up.  */
+
+ tp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+
+ if (in < qn)
+ {
+ if (in == 0)
+ {
+ MPN_COPY (rp, n2p, rn);
+ if (rn != dn)
+ abort ();
+ goto foo;
+ }
+ mpn_mul (tp, qp, qn, dp, in);
+ }
+ else
+ mpn_mul (tp, dp, in, qp, qn);
+
+ cy = mpn_sub (n2p, n2p, rn, tp + in, qn);
+ MPN_COPY (rp + in, n2p, dn - in);
+ quotient_too_large |= cy;
+ cy = mpn_sub_n (rp, np, tp, in);
+ cy = mpn_sub_1 (rp + in, rp + in, rn, cy);
+ quotient_too_large |= cy;
+ foo:
+ if (quotient_too_large)
+ {
+ mpn_decr_u (qp, (mp_limb_t) 1);
+ mpn_add_n (rp, rp, dp, dn);
+ }
+ }
+ TMP_FREE (marker);
+ return;
+ }
+ }
+}
diff --git a/rts/gmp/mpn/generic/udiv_w_sdiv.c b/rts/gmp/mpn/generic/udiv_w_sdiv.c
new file mode 100644
index 0000000000..061cce86e1
--- /dev/null
+++ b/rts/gmp/mpn/generic/udiv_w_sdiv.c
@@ -0,0 +1,131 @@
+/* mpn_udiv_w_sdiv -- implement udiv_qrnnd on machines with only signed
+ division.
+
+ Contributed by Peter L. Montgomery.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY SAFE
+ TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THIS FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE
+ GNU MP RELEASE.
+
+
+Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
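+/* The problem: compute the unsigned double-limb division (a1,a0) / d when
+   the hardware offers only a signed sdiv_qrnnd, which misbehaves whenever
+   the divisor or the quotient would fall outside the signed range.  Each
+   branch below reduces its case to a signed division on operands known to
+   be in range, then fixes up the quotient and remainder.  */
+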
+mp_limb_t
+mpn_udiv_w_sdiv (rp, a1, a0, d)
+ mp_limb_t *rp, a1, a0, d;
+{
+ mp_limb_t q, r;
+ mp_limb_t c0, c1, b1;
+
+ if ((mp_limb_signed_t) d >= 0)
+ {
+ if (a1 < d - a1 - (a0 >> (BITS_PER_MP_LIMB - 1)))
+ {
+ /* dividend, divisor, and quotient are nonnegative */
+ sdiv_qrnnd (q, r, a1, a0, d);
+ }
+ else
+ {
+ /* Compute c1*2^32 + c0 = a1*2^32 + a0 - 2^31*d */
+ sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (BITS_PER_MP_LIMB - 1));
+ /* Divide (c1*2^32 + c0) by d */
+ sdiv_qrnnd (q, r, c1, c0, d);
+ /* Add 2^31 to quotient */
+ q += (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+ }
+ }
+ else
+ {
+ b1 = d >> 1; /* d/2, between 2^30 and 2^31 - 1 */
+ c1 = a1 >> 1; /* A/2 */
+ c0 = (a1 << (BITS_PER_MP_LIMB - 1)) + (a0 >> 1);
+
+ if (a1 < b1) /* A < 2^32*b1, so A/2 < 2^31*b1 */
+ {
+ sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */
+
+ r = 2*r + (a0 & 1); /* Remainder from A/(2*b1) */
+ if ((d & 1) != 0)
+ {
+ if (r >= q)
+ r = r - q;
+ else if (q - r <= d)
+ {
+ r = r - q + d;
+ q--;
+ }
+ else
+ {
+ r = r - q + 2*d;
+ q -= 2;
+ }
+ }
+ }
+ else if (c1 < b1) /* So 2^31 <= (A/2)/b1 < 2^32 */
+ {
+ c1 = (b1 - 1) - c1;
+ c0 = ~c0; /* logical NOT */
+
+ sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */
+
+ q = ~q; /* (A/2)/b1 */
+ r = (b1 - 1) - r;
+
+ r = 2*r + (a0 & 1); /* A/(2*b1) */
+
+ if ((d & 1) != 0)
+ {
+ if (r >= q)
+ r = r - q;
+ else if (q - r <= d)
+ {
+ r = r - q + d;
+ q--;
+ }
+ else
+ {
+ r = r - q + 2*d;
+ q -= 2;
+ }
+ }
+ }
+ else /* Implies c1 = b1 */
+ { /* Hence a1 = d - 1 = 2*b1 - 1 */
+ if (a0 >= -d)
+ {
+ q = -1;
+ r = a0 + d;
+ }
+ else
+ {
+ q = -2;
+ r = a0 + 2*d;
+ }
+ }
+ }
+
+ *rp = r;
+ return q;
+}