summaryrefslogtreecommitdiff
path: root/mpn
diff options
context:
space:
mode:
authorNiels M?ller <nisse@lysator.liu.se>2020-01-24 04:21:19 +0100
committerNiels M?ller <nisse@lysator.liu.se>2020-01-24 04:21:19 +0100
commit04f529b01f59c3aea6098ea2eee83e6228373e86 (patch)
treeaf5a25189ecacb58b70dd3ed5764440d1d91d29b /mpn
parentf6c4822bc5ec0997553d5601a92cf557d6252a76 (diff)
downloadgmp-04f529b01f59c3aea6098ea2eee83e6228373e86.tar.gz
Move div1 and div2 to a separate file, hgcd2-div.h
Diffstat (limited to 'mpn')
-rw-r--r--mpn/generic/hgcd2-div.h490
-rw-r--r--mpn/generic/hgcd2.c453
2 files changed, 491 insertions, 452 deletions
diff --git a/mpn/generic/hgcd2-div.h b/mpn/generic/hgcd2-div.h
new file mode 100644
index 000000000..d420609ec
--- /dev/null
+++ b/mpn/generic/hgcd2-div.h
@@ -0,0 +1,490 @@
+/* hgcd2-div.h
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
+ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
+ GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019, 2020 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef HGCD2_DIV1_METHOD
+#define HGCD2_DIV1_METHOD 3
+#endif
+
+#ifndef HGCD2_DIV2_METHOD
+#define HGCD2_DIV2_METHOD 2
+#endif
+
+#if HAVE_NATIVE_mpn_div_11
+
+#define div1 mpn_div_11
+/* Single-limb division optimized for small quotients.
+ Returned value holds d0 = r, d1 = q. */
+mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
+
+#elif HGCD2_DIV1_METHOD == 1
+
+static inline mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+ mp_double_limb_t res;
+ res.d1 = n0 / d0;
+ res.d0 = n0 - res.d1 * d0;
+
+ return res;
+}
+
+#elif HGCD2_DIV1_METHOD == 2
+
+static mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+ mp_double_limb_t res;
+ int ncnt, dcnt, cnt;
+ mp_limb_t q;
+ mp_limb_t mask;
+
+ ASSERT (n0 >= d0);
+
+ count_leading_zeros (ncnt, n0);
+ count_leading_zeros (dcnt, d0);
+ cnt = dcnt - ncnt;
+
+ d0 <<= cnt;
+
+ q = -(mp_limb_t) (n0 >= d0);
+ n0 -= d0 & q;
+ d0 >>= 1;
+ q = -q;
+
+ while (--cnt >= 0)
+ {
+ mask = -(mp_limb_t) (n0 >= d0);
+ n0 -= d0 & mask;
+ d0 >>= 1;
+ q = (q << 1) - mask;
+ }
+
+ res.d0 = n0;
+ res.d1 = q;
+ return res;
+}
+
+#elif HGCD2_DIV1_METHOD == 3
+
+static inline mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+ mp_double_limb_t res;
+ if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
+ || UNLIKELY (n0 >= (d0 << 3)))
+ {
+ res.d1 = n0 / d0;
+ res.d0 = n0 - res.d1 * d0;
+ }
+ else
+ {
+ mp_limb_t q, mask;
+
+ d0 <<= 2;
+
+ mask = -(mp_limb_t) (n0 >= d0);
+ n0 -= d0 & mask;
+ q = 4 & mask;
+
+ d0 >>= 1;
+ mask = -(mp_limb_t) (n0 >= d0);
+ n0 -= d0 & mask;
+ q += 2 & mask;
+
+ d0 >>= 1;
+ mask = -(mp_limb_t) (n0 >= d0);
+ n0 -= d0 & mask;
+ q -= mask;
+
+ res.d0 = n0;
+ res.d1 = q;
+ }
+ return res;
+}
+
+#elif HGCD2_DIV1_METHOD == 4
+
+/* Table quotients. We extract the NBITS most significant bits of the
+ numerator limb, and the corresponding bits from the divisor limb, and use
+ these to form an index into the table. This method is probably only useful
+ for short pipelines with slow multiplication.
+
+ Possible improvements:
+
+ * Perhaps extract the highest NBITS of the divisor instead of the same bits
+ as from the numerator. That would require another count_leading_zeros,
+ and a post-multiply shift of the quotient.
+
+ * Compress tables? Their values are tiny, and there are lots of zero
+ entries (which are never used).
+
+ * Round the table entries more cleverly?
+*/
+
+#ifndef NBITS
+#define NBITS 5
+#endif
+
+#if NBITS == 5
+/* This needs full division about 13.2% of the time. */
+static const unsigned char tab[512] = {
+17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
+21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
+22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
+23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
+24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
+25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
+26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
+27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
+28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
+29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
+30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
+31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
+32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+};
+#elif NBITS == 6
+/* This needs full division about 9.8% of the time. */
+static const unsigned char tab[2048] = {
+33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
+53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
+54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
+55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
+56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
+57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
+58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
+59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
+60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
+61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
+62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
+63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
+64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+};
+#else
+#error No table for provided NBITS
+#endif
+
+static const unsigned char *tabp = tab - (1 << (NBITS - 1) << NBITS);
+
+static inline mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+ int ncnt;
+ size_t nbi, dbi;
+ mp_limb_t q0;
+ mp_limb_t r0;
+ mp_limb_t mask;
+ mp_double_limb_t res;
+
+ ASSERT (n0 >= d0); /* Actually only msb position is critical. */
+
+ count_leading_zeros (ncnt, n0);
+ nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
+ dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
+
+ q0 = tabp[(nbi << NBITS) + dbi];
+ r0 = n0 - q0 * d0;
+ mask = -(mp_limb_t) (r0 >= d0);
+ q0 -= mask;
+ r0 -= d0 & mask;
+
+ if (UNLIKELY (r0 >= d0))
+ {
+ q0 = n0 / d0;
+ r0 = n0 - q0 * d0;
+ }
+
+ res.d1 = q0;
+ res.d0 = r0;
+ return res;
+}
+
+#elif HGCD2_DIV1_METHOD == 5
+
+/* Table inverses of divisors. We don't bother with suppressing the msb from
+ the tables. We index with the NBITS most significant divisor bits,
+ including the always-set highest bit, but use addressing trickery via tabp
+ to suppress it.
+
+ Possible improvements:
+
+ * Do first multiply using 32-bit operations on 64-bit computers. At least
+ on most Arm64 cores, that uses 3 times less resources. It also saves on
+ many x86-64 processors.
+*/
+
+#ifndef NBITS
+#define NBITS 7
+#endif
+
+#if NBITS == 5
+/* This needs full division about 1.63% of the time. */
+static const unsigned char tab[16] = {
+ 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
+};
+static const unsigned char *tabp = tab - (1 << (NBITS - 1));
+#elif NBITS == 6
+/* This needs full division about 0.93% of the time. */
+static const unsigned char tab[32] = {
+127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
+ 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
+};
+static const unsigned char *tabp = tab - (1 << (NBITS - 1));
+#elif NBITS == 7
+/* This needs full division about 0.49% of the time. */
+static const unsigned char tab[64] = {
+255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
+203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
+169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
+145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
+};
+static const unsigned char *tabp = tab - (1 << (NBITS - 1));
+#elif NBITS == 8
+/* This needs full division about 0.26% of the time. */
+static const unsigned short tab[128] = {
+511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
+454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
+408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
+371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
+340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
+314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
+291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
+272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
+};
+static const unsigned short *tabp = tab - (1 << (NBITS - 1));
+#else
+#error No table for provided NBITS
+#endif
+
+static inline mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+ int ncnt, dcnt;
+ size_t dbi;
+ mp_limb_t inv;
+ mp_limb_t q0;
+ mp_limb_t r0;
+ mp_limb_t mask;
+ mp_double_limb_t res;
+
+ count_leading_zeros (ncnt, n0);
+ count_leading_zeros (dcnt, d0);
+
+ dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
+ inv = tabp[dbi];
+ q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
+ r0 = n0 - q0 * d0;
+ mask = -(mp_limb_t) (r0 >= d0);
+ q0 -= mask;
+ r0 -= d0 & mask;
+
+ if (UNLIKELY (r0 >= d0))
+ {
+ q0 = n0 / d0;
+ r0 = n0 - q0 * d0;
+ }
+
+ res.d1 = q0;
+ res.d0 = r0;
+ return res;
+}
+
+#else
+#error Unknown HGCD2_DIV1_METHOD
+#endif
+
+#if HAVE_NATIVE_mpn_div_22
+
+#define div2 mpn_div_22
+/* Two-limb division optimized for small quotients. */
+mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
+
+#elif HGCD2_DIV2_METHOD == 1
+
+static mp_limb_t
+div2 (mp_ptr rp,
+ mp_limb_t n1, mp_limb_t n0,
+ mp_limb_t d1, mp_limb_t d0)
+{
+ mp_double_limb_t rq = div1 (n1, d1);
+ if (UNLIKELY (rq.d1 > d1))
+ {
+ mp_limb_t n2, q, t1, t0;
+ int c;
+
+ /* Normalize */
+ count_leading_zeros (c, d1);
+ ASSERT (c > 0);
+
+ n2 = n1 >> (GMP_LIMB_BITS - c);
+ n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
+ n0 <<= c;
+ d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
+ d0 <<= c;
+
+ udiv_qrnnd (q, n1, n2, n1, d1);
+ umul_ppmm (t1, t0, q, d0);
+ if (t1 > n1 || (t1 == n1 && t0 > n0))
+ {
+ ASSERT (q > 0);
+ q--;
+ sub_ddmmss (t1, t0, t1, t0, d1, d0);
+ }
+ sub_ddmmss (n1, n0, n1, n0, t1, t0);
+
+ /* Undo normalization */
+ rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
+ rp[1] = n1 >> c;
+
+ return q;
+ }
+ else
+ {
+ mp_limb_t q, t1, t0;
+ n1 = rq.d0;
+ q = rq.d1;
+ umul_ppmm (t1, t0, q, d0);
+ if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
+ {
+ ASSERT (q > 0);
+ q--;
+ sub_ddmmss (t1, t0, t1, t0, d1, d0);
+ }
+ sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
+ return q;
+ }
+}
+
+#elif HGCD2_DIV2_METHOD == 2
+
+/* Bit-wise div2. Relies on fast count_leading_zeros. */
+static mp_limb_t
+div2 (mp_ptr rp,
+ mp_limb_t n1, mp_limb_t n0,
+ mp_limb_t d1, mp_limb_t d0)
+{
+ mp_limb_t q = 0;
+ int ncnt;
+ int dcnt;
+
+ count_leading_zeros (ncnt, n1);
+ count_leading_zeros (dcnt, d1);
+ dcnt -= ncnt;
+
+ d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
+ d0 <<= dcnt;
+
+ do
+ {
+ mp_limb_t mask;
+ q <<= 1;
+ if (UNLIKELY (n1 == d1))
+ mask = -(n0 >= d0);
+ else
+ mask = -(n1 > d1);
+
+ q -= mask;
+
+ sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
+
+ d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
+ d1 = d1 >> 1;
+ }
+ while (dcnt--);
+
+ rp[0] = n0;
+ rp[1] = n1;
+
+ return q;
+}
+#else
+#error Unknown HGCD2_DIV2_METHOD
+#endif
diff --git a/mpn/generic/hgcd2.c b/mpn/generic/hgcd2.c
index 3fa40122e..43d4d4813 100644
--- a/mpn/generic/hgcd2.c
+++ b/mpn/generic/hgcd2.c
@@ -36,463 +36,12 @@ see https://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#include "longlong.h"
-#ifndef HGCD2_DIV1_METHOD
-#define HGCD2_DIV1_METHOD 3
-#endif
-
-#ifndef HGCD2_DIV2_METHOD
-#define HGCD2_DIV2_METHOD 2
-#endif
+#include "mpn/generic/hgcd2-div.h"
#if GMP_NAIL_BITS != 0
#error Nails not implemented
#endif
-#if HAVE_NATIVE_mpn_div_11
-
-#define div1 mpn_div_11
-/* Single-limb division optimized for small quotients.
- Returned value holds d0 = r, d1 = q. */
-mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
-
-#elif HGCD2_DIV1_METHOD == 1
-
-static inline mp_double_limb_t
-div1 (mp_limb_t n0, mp_limb_t d0)
-{
- mp_double_limb_t res;
- res.d1 = n0 / d0;
- res.d0 = n0 - res.d1 * d0;
-
- return res;
-}
-
-#elif HGCD2_DIV1_METHOD == 2
-
-static mp_double_limb_t
-div1 (mp_limb_t n0, mp_limb_t d0)
-{
- mp_double_limb_t res;
- int ncnt, dcnt, cnt;
- mp_limb_t q;
- mp_limb_t mask;
-
- ASSERT (n0 >= d0);
-
- count_leading_zeros (ncnt, n0);
- count_leading_zeros (dcnt, d0);
- cnt = dcnt - ncnt;
-
- d0 <<= cnt;
-
- q = -(mp_limb_t) (n0 >= d0);
- n0 -= d0 & q;
- d0 >>= 1;
- q = -q;
-
- while (--cnt >= 0)
- {
- mask = -(mp_limb_t) (n0 >= d0);
- n0 -= d0 & mask;
- d0 >>= 1;
- q = (q << 1) - mask;
- }
-
- res.d0 = n0;
- res.d1 = q;
- return res;
-}
-
-#elif HGCD2_DIV1_METHOD == 3
-
-static inline mp_double_limb_t
-div1 (mp_limb_t n0, mp_limb_t d0)
-{
- mp_double_limb_t res;
- if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
- || UNLIKELY (n0 >= (d0 << 3)))
- {
- res.d1 = n0 / d0;
- res.d0 = n0 - res.d1 * d0;
- }
- else
- {
- mp_limb_t q, mask;
-
- d0 <<= 2;
-
- mask = -(mp_limb_t) (n0 >= d0);
- n0 -= d0 & mask;
- q = 4 & mask;
-
- d0 >>= 1;
- mask = -(mp_limb_t) (n0 >= d0);
- n0 -= d0 & mask;
- q += 2 & mask;
-
- d0 >>= 1;
- mask = -(mp_limb_t) (n0 >= d0);
- n0 -= d0 & mask;
- q -= mask;
-
- res.d0 = n0;
- res.d1 = q;
- }
- return res;
-}
-
-#elif HGCD2_DIV1_METHOD == 4
-
-/* Table quotients. We extract the NBITS most significant bits of the
- numerator limb, and the corresponding bits from the divisor limb, and use
- these to form an index into the table. This method is probably only useful
- for short pipelines with slow multiplication.
-
- Possible improvements:
-
- * Perhaps extract the highest NBITS of the divisor instead of the same bits
- as from the numerator. That would require another count_leading_zeros,
- and a post-multiply shift of the quotient.
-
- * Compress tables? Their values are tiny, and there are lots of zero
- entries (which are never used).
-
- * Round the table entries more cleverly?
-*/
-
-#ifndef NBITS
-#define NBITS 5
-#endif
-
-#if NBITS == 5
-/* This needs full division about 13.2% of the time. */
-static const unsigned char tab[512] = {
-17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
-20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
-21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
-22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
-23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
-24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
-25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
-26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
-27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
-28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
-29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
-30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
-31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
-32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-};
-#elif NBITS == 6
-/* This needs full division about 9.8% of the time. */
-static const unsigned char tab[2048] = {
-33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
-52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
-53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
-54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
-55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
-56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
-57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
-58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
-59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
-60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
-61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
-62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
-63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
-64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
- 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-};
-#else
-#error No table for provided NBITS
-#endif
-
-static const unsigned char *tabp = tab - (1 << (NBITS - 1) << NBITS);
-
-static inline mp_double_limb_t
-div1 (mp_limb_t n0, mp_limb_t d0)
-{
- int ncnt;
- size_t nbi, dbi;
- mp_limb_t q0;
- mp_limb_t r0;
- mp_limb_t mask;
- mp_double_limb_t res;
-
- ASSERT (n0 >= d0); /* Actually only msb position is critical. */
-
- count_leading_zeros (ncnt, n0);
- nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
- dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
-
- q0 = tabp[(nbi << NBITS) + dbi];
- r0 = n0 - q0 * d0;
- mask = -(mp_limb_t) (r0 >= d0);
- q0 -= mask;
- r0 -= d0 & mask;
-
- if (UNLIKELY (r0 >= d0))
- {
- q0 = n0 / d0;
- r0 = n0 - q0 * d0;
- }
-
- res.d1 = q0;
- res.d0 = r0;
- return res;
-}
-
-#elif HGCD2_DIV1_METHOD == 5
-
-/* Table inverses of divisors. We don't bother with suppressing the msb from
- the tables. We index with the NBITS most significant divisor bits,
- including the always-set highest bit, but use addressing trickery via tabp
- to suppress it.
-
- Possible improvements:
-
- * Do first multiply using 32-bit operations on 64-bit computers. At least
- on most Arm64 cores, that uses 3 times less resources. It also saves on
- many x86-64 processors.
-*/
-
-#ifndef NBITS
-#define NBITS 7
-#endif
-
-#if NBITS == 5
-/* This needs full division about 1.63% of the time. */
-static const unsigned char tab[16] = {
- 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
-};
-static const unsigned char *tabp = tab - (1 << (NBITS - 1));
-#elif NBITS == 6
-/* This needs full division about 0.93% of the time. */
-static const unsigned char tab[32] = {
-127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
- 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
-};
-static const unsigned char *tabp = tab - (1 << (NBITS - 1));
-#elif NBITS == 7
-/* This needs full division about 0.49% of the time. */
-static const unsigned char tab[64] = {
-255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
-203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
-169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
-145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
-};
-static const unsigned char *tabp = tab - (1 << (NBITS - 1));
-#elif NBITS == 8
-/* This needs full division about 0.26% of the time. */
-static const unsigned short tab[128] = {
-511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
-454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
-408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
-371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
-340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
-314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
-291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
-272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
-};
-static const unsigned short *tabp = tab - (1 << (NBITS - 1));
-#else
-#error No table for provided NBITS
-#endif
-
-static inline mp_double_limb_t
-div1 (mp_limb_t n0, mp_limb_t d0)
-{
- int ncnt, dcnt;
- size_t dbi;
- mp_limb_t inv;
- mp_limb_t q0;
- mp_limb_t r0;
- mp_limb_t mask;
- mp_double_limb_t res;
-
- count_leading_zeros (ncnt, n0);
- count_leading_zeros (dcnt, d0);
-
- dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
- inv = tabp[dbi];
- q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
- r0 = n0 - q0 * d0;
- mask = -(mp_limb_t) (r0 >= d0);
- q0 -= mask;
- r0 -= d0 & mask;
-
- if (UNLIKELY (r0 >= d0))
- {
- q0 = n0 / d0;
- r0 = n0 - q0 * d0;
- }
-
- res.d1 = q0;
- res.d0 = r0;
- return res;
-}
-
-#else
-#error Unknown HGCD2_DIV1_METHOD
-#endif
-
-#if HAVE_NATIVE_mpn_div_22
-
-#define div2 mpn_div_22
-/* Two-limb division optimized for small quotients. */
-mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
-
-#elif HGCD2_DIV2_METHOD == 1
-
-static mp_limb_t
-div2 (mp_ptr rp,
- mp_limb_t n1, mp_limb_t n0,
- mp_limb_t d1, mp_limb_t d0)
-{
- mp_double_limb_t rq = div1 (n1, d1);
- if (UNLIKELY (rq.d1 > d1))
- {
- mp_limb_t n2, q, t1, t0;
- int c;
-
- /* Normalize */
- count_leading_zeros (c, d1);
- ASSERT (c > 0);
-
- n2 = n1 >> (GMP_LIMB_BITS - c);
- n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
- n0 <<= c;
- d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
- d0 <<= c;
-
- udiv_qrnnd (q, n1, n2, n1, d1);
- umul_ppmm (t1, t0, q, d0);
- if (t1 > n1 || (t1 == n1 && t0 > n0))
- {
- ASSERT (q > 0);
- q--;
- sub_ddmmss (t1, t0, t1, t0, d1, d0);
- }
- sub_ddmmss (n1, n0, n1, n0, t1, t0);
-
- /* Undo normalization */
- rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
- rp[1] = n1 >> c;
-
- return q;
- }
- else
- {
- mp_limb_t q, t1, t0;
- n1 = rq.d0;
- q = rq.d1;
- umul_ppmm (t1, t0, q, d0);
- if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
- {
- ASSERT (q > 0);
- q--;
- sub_ddmmss (t1, t0, t1, t0, d1, d0);
- }
- sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
- return q;
- }
-}
-
-#elif HGCD2_DIV2_METHOD == 2
-
-/* Bit-wise div2. Relies on fast count_leading_zeros. */
-static mp_limb_t
-div2 (mp_ptr rp,
- mp_limb_t n1, mp_limb_t n0,
- mp_limb_t d1, mp_limb_t d0)
-{
- mp_limb_t q = 0;
- int ncnt;
- int dcnt;
-
- count_leading_zeros (ncnt, n1);
- count_leading_zeros (dcnt, d1);
- dcnt -= ncnt;
-
- d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
- d0 <<= dcnt;
-
- do
- {
- mp_limb_t mask;
- q <<= 1;
- if (UNLIKELY (n1 == d1))
- mask = -(n0 >= d0);
- else
- mask = -(n1 > d1);
-
- q -= mask;
-
- sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
-
- d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
- d1 = d1 >> 1;
- }
- while (dcnt--);
-
- rp[0] = n0;
- rp[1] = n1;
-
- return q;
-}
-#else
-#error Unknown HGCD2_DIV2_METHOD
-#endif
-
/* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs
matrix M. Returns 1 if we make progress, i.e. can perform at least
one subtraction. Otherwise returns zero. */