1 files changed, 650 insertions, 0 deletions
diff --git a/Modules/_decimal/libmpdec/umodarith.h b/Modules/_decimal/libmpdec/umodarith.h
new file mode 100644
index 0000000000..68d15188cb
--- /dev/null
+++ b/Modules/_decimal/libmpdec/umodarith.h
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#ifndef UMODARITH_H
+#define UMODARITH_H
+
+
+#include "constants.h"
+#include "mpdecimal.h"
+#include "typearith.h"
+
+
+/* Bignum: Low level routines for unsigned modular arithmetic. These are
+   used in the fast convolution functions for very large coefficients. */
+
+
+/**************************************************************************/
+/*                        ANSI modular arithmetic                         */
+/**************************************************************************/
+
+
+/*
+ * Restrictions: a < m and b < m
+ * ACL2 proof: umodarith.lisp: addmod-correct
+ */
+static inline mpd_uint_t
+addmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
+{
+    mpd_uint_t s;
+
+    s = a + b;
+    s = (s < a) ? s - m : s;
+    s = (s >= m) ? s - m : s;
+
+    return s;
+}
+
+/*
+ * Restrictions: a < m and b < m
+ * ACL2 proof: umodarith.lisp: submod-2-correct
+ */
+static inline mpd_uint_t
+submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
+{
+    mpd_uint_t d;
+
+    d = a - b;
+    d = (a < b) ? d + m : d;
+
+    return d;
+}
+
+/*
+ * Restrictions: a < 2m and b < 2m
+ * ACL2 proof: umodarith.lisp: section ext-submod
+ */
+static inline mpd_uint_t
+ext_submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
+{
+    mpd_uint_t d;
+
+    a = (a >= m) ? a - m : a;
+    b = (b >= m) ? b - m : b;
+
+    d = a - b;
+    d = (a < b) ? d + m : d;
+
+    return d;
+}
+
+/*
+ * Reduce double word modulo m.
+ * Restrictions: m != 0
+ * ACL2 proof: umodarith.lisp: section dw-reduce
+ */
+static inline mpd_uint_t
+dw_reduce(mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
+{
+    mpd_uint_t r1, r2, w;
+
+    _mpd_div_word(&w, &r1, hi, m);
+    _mpd_div_words(&w, &r2, r1, lo, m);
+
+    return r2;
+}
+
+/*
+ * Subtract double word from a.
+ * Restrictions: a < m
+ * ACL2 proof: umodarith.lisp: section dw-submod
+ */
+static inline mpd_uint_t
+dw_submod(mpd_uint_t a, mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
+{
+    mpd_uint_t d, r;
+
+    r = dw_reduce(hi, lo, m);
+    d = a - r;
+    d = (a < r) ? d + m : d;
+
+    return d;
+}
+
+#ifdef CONFIG_64
+
+/**************************************************************************/
+/*                        64-bit modular arithmetic                       */
+/**************************************************************************/
+
+/*
+ * A proof of the algorithm is in literature/mulmod-64.txt. An ACL2
+ * proof is in umodarith.lisp: section "Fast modular reduction".
+ *
+ * Algorithm: calculate (a * b) % p:
+ *
+ *   a) hi, lo <- a * b       # Calculate a * b.
+ *
+ *   b) hi, lo <-  R(hi, lo)  # Reduce modulo p.
+ *
+ *   c) Repeat step b) until 0 <= hi * 2**64 + lo < 2*p.
+ *
+ *   d) If the result is less than p, return lo. Otherwise return lo - p.
+ */
+
+static inline mpd_uint_t
+x64_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
+{
+    mpd_uint_t hi, lo, x, y;
+
+
+    _mpd_mul_words(&hi, &lo, a, b);
+
+    if (m & (1ULL<<32)) { /* P1 */
+
+        /* first reduction */
+        x = y = hi;
+        hi >>= 32;
+
+        x = lo - x;
+        if (x > lo) hi--;
+
+        y <<= 32;
+        lo = y + x;
+        if (lo < y) hi++;
+
+        /* second reduction */
+        x = y = hi;
+        hi >>= 32;
+
+        x = lo - x;
+        if (x > lo) hi--;
+
+        y <<= 32;
+        lo = y + x;
+        if (lo < y) hi++;
+
+        return (hi || lo >= m ? lo - m : lo);
+    }
+    else if (m & (1ULL<<34)) { /* P2 */
+
+        /* first reduction */
+        x = y = hi;
+        hi >>= 30;
+
+        x = lo - x;
+        if (x > lo) hi--;
+
+        y <<= 34;
+        lo = y + x;
+        if (lo < y) hi++;
+
+        /* second reduction */
+        x = y = hi;
+        hi >>= 30;
+
+        x = lo - x;
+        if (x > lo) hi--;
+
+        y <<= 34;
+        lo = y + x;
+        if (lo < y) hi++;
+
+        /* third reduction */
+        x = y = hi;
+        hi >>= 30;
+
+        x = lo - x;
+        if (x > lo) hi--;
+
+        y <<= 34;
+        lo = y + x;
+        if (lo < y) hi++;
+
+        return (hi || lo >= m ? lo - m : lo);
+    }
+    else { /* P3 */
+
+        /* first reduction */
+        x = y = hi;
+        hi >>= 24;
+
+        x = lo - x;
+        if (x > lo) hi--;
+
+        y <<= 40;
+        lo = y + x;
+        if (lo < y) hi++;
+
+        /* second reduction */
+        x = y = hi;
+        hi >>= 24;
+
+        x = lo - x;
+        if (x > lo) hi--;
+
+        y <<= 40;
+        lo = y + x;
+        if (lo < y) hi++;
+
+        /* third reduction */
+        x = y = hi;
+        hi >>= 24;
+
+        x = lo - x;
+        if (x > lo) hi--;
+
+        y <<= 40;
+        lo = y + x;
+        if (lo < y) hi++;
+
+        return (hi || lo >= m ? lo - m : lo);
+    }
+}
+
+static inline void
+x64_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
+{
+    *a = x64_mulmod(*a, w, m);
+    *b = x64_mulmod(*b, w, m);
+}
+
+static inline void
+x64_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
+            mpd_uint_t m)
+{
+    *a0 = x64_mulmod(*a0, b0, m);
+    *a1 = x64_mulmod(*a1, b1, m);
+}
+
+static inline mpd_uint_t
+x64_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
+{
+    mpd_uint_t r = 1;
+
+    while (exp > 0) {
+        if (exp & 1)
+            r = x64_mulmod(r, base, umod);
+        base = x64_mulmod(base, base, umod);
+        exp >>= 1;
+    }
+
+    return r;
+}
+
+/* END CONFIG_64 */
+#else /* CONFIG_32 */
+
+
+/**************************************************************************/
+/*                        32-bit modular arithmetic                       */
+/**************************************************************************/
+
+#if defined(ANSI)
+#if !defined(LEGACY_COMPILER)
+/* HAVE_UINT64_T */
+static inline mpd_uint_t
+std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
+{
+    return ((mpd_uuint_t) a * b) % m;
+}
+
+static inline void
+std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
+{
+    *a = ((mpd_uuint_t) *a * w) % m;
+    *b = ((mpd_uuint_t) *b * w) % m;
+}
+
+static inline void
+std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
+            mpd_uint_t m)
+{
+    *a0 = ((mpd_uuint_t) *a0 * b0) % m;
+    *a1 = ((mpd_uuint_t) *a1 * b1) % m;
+}
+/* END HAVE_UINT64_T */
+#else
+/* LEGACY_COMPILER */
+static inline mpd_uint_t
+std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
+{
+    mpd_uint_t hi, lo, q, r;
+    _mpd_mul_words(&hi, &lo, a, b);
+    _mpd_div_words(&q, &r, hi, lo, m);
+    return r;
+}
+
+static inline void
+std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
+{
+    *a = std_mulmod(*a, w, m);
+    *b = std_mulmod(*b, w, m);
+}
+
+static inline void
+std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
+            mpd_uint_t m)
+{
+    *a0 = std_mulmod(*a0, b0, m);
+    *a1 = std_mulmod(*a1, b1, m);
+}
+/* END LEGACY_COMPILER */
+#endif
+
+static inline mpd_uint_t
+std_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
+{
+    mpd_uint_t r = 1;
+
+    while (exp > 0) {
+        if (exp & 1)
+            r = std_mulmod(r, base, umod);
+        base = std_mulmod(base, base, umod);
+        exp >>= 1;
+    }
+
+    return r;
+}
+#endif /* ANSI CONFIG_32 */
+
+
+/**************************************************************************/
+/*                    Pentium Pro modular arithmetic                      */
+/**************************************************************************/
+
+/*
+ * A proof of the algorithm is in literature/mulmod-ppro.txt. The FPU
+ * control word must be set to 64-bit precision and truncation mode
+ * prior to using these functions.
+ *
+ * Algorithm: calculate (a * b) % p:
+ *
+ *   p    := prime < 2**31
+ *   pinv := (long double)1.0 / p (precalculated)
+ *
+ *   a) n = a * b              # Calculate exact product.
+ *   b) qest = n * pinv        # Calculate estimate for q = n / p.
+ *   c) q = (qest+2**63)-2**63 # Truncate qest to the exact quotient.
+ *   d) r = n - q * p          # Calculate remainder.
+ *
+ * Remarks:
+ *
+ *   - p = dmod and pinv = dinvmod.
+ *   - dinvmod points to an array of three uint32_t, which is interpreted
+ *     as an 80 bit long double by fldt.
+ *   - Intel compilers prior to version 11 do not seem to handle the
+ *     __GNUC__ inline assembly correctly.
+ *   - random tests are provided in tests/extended/ppro_mulmod.c
+ */
+
+#if defined(PPRO)
+#if defined(ASM)
+
+/* Return (a * b) % dmod */
+static inline mpd_uint_t
+ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
+{
+    mpd_uint_t retval;
+
+    __asm__ (
+            "fildl  %2\n\t"
+            "fildl  %1\n\t"
+            "fmulp  %%st, %%st(1)\n\t"
+            "fldt   (%4)\n\t"
+            "fmul   %%st(1), %%st\n\t"
+            "flds   %5\n\t"
+            "fadd   %%st, %%st(1)\n\t"
+            "fsubrp %%st, %%st(1)\n\t"
+            "fldl   (%3)\n\t"
+            "fmulp  %%st, %%st(1)\n\t"
+            "fsubrp %%st, %%st(1)\n\t"
+            "fistpl %0\n\t"
+            : "=m" (retval)
+            : "m" (a), "m" (b), "r" (dmod), "r" (dinvmod), "m" (MPD_TWO63)
+            : "st", "memory"
+    );
+
+    return retval;
+}
+
+/*
+ * Two modular multiplications in parallel:
+ *      *a0 = (*a0 * w) % dmod
+ *      *a1 = (*a1 * w) % dmod
+ */
+static inline void
+ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
+              double *dmod, uint32_t *dinvmod)
+{
+    __asm__ (
+            "fildl  %2\n\t"
+            "fildl  (%1)\n\t"
+            "fmul   %%st(1), %%st\n\t"
+            "fxch   %%st(1)\n\t"
+            "fildl  (%0)\n\t"
+            "fmulp  %%st, %%st(1) \n\t"
+            "fldt   (%4)\n\t"
+            "flds   %5\n\t"
+            "fld    %%st(2)\n\t"
+            "fmul   %%st(2)\n\t"
+            "fadd   %%st(1)\n\t"
+            "fsub   %%st(1)\n\t"
+            "fmull  (%3)\n\t"
+            "fsubrp %%st, %%st(3)\n\t"
+            "fxch   %%st(2)\n\t"
+            "fistpl (%0)\n\t"
+            "fmul   %%st(2)\n\t"
+            "fadd   %%st(1)\n\t"
+            "fsubp  %%st, %%st(1)\n\t"
+            "fmull  (%3)\n\t"
+            "fsubrp %%st, %%st(1)\n\t"
+            "fistpl (%1)\n\t"
+            : : "r" (a0), "r" (a1), "m" (w),
+                "r" (dmod), "r" (dinvmod),
+                "m" (MPD_TWO63)
+            : "st", "memory"
+    );
+}
+
+/*
+ * Two modular multiplications in parallel:
+ *      *a0 = (*a0 * b0) % dmod
+ *      *a1 = (*a1 * b1) % dmod
+ */
+static inline void
+ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
+             double *dmod, uint32_t *dinvmod)
+{
+    __asm__ (
+            "fildl  %3\n\t"
+            "fildl  (%2)\n\t"
+            "fmulp  %%st, %%st(1)\n\t"
+            "fildl  %1\n\t"
+            "fildl  (%0)\n\t"
+            "fmulp  %%st, %%st(1)\n\t"
+            "fldt   (%5)\n\t"
+            "fld    %%st(2)\n\t"
+            "fmul   %%st(1), %%st\n\t"
+            "fxch   %%st(1)\n\t"
+            "fmul   %%st(2), %%st\n\t"
+            "flds   %6\n\t"
+            "fldl   (%4)\n\t"
+            "fxch   %%st(3)\n\t"
+            "fadd   %%st(1), %%st\n\t"
+            "fxch   %%st(2)\n\t"
+            "fadd   %%st(1), %%st\n\t"
+            "fxch   %%st(2)\n\t"
+            "fsub   %%st(1), %%st\n\t"
+            "fxch   %%st(2)\n\t"
+            "fsubp  %%st, %%st(1)\n\t"
+            "fxch   %%st(1)\n\t"
+            "fmul   %%st(2), %%st\n\t"
+            "fxch   %%st(1)\n\t"
+            "fmulp  %%st, %%st(2)\n\t"
+            "fsubrp %%st, %%st(3)\n\t"
+            "fsubrp %%st, %%st(1)\n\t"
+            "fxch   %%st(1)\n\t"
+            "fistpl (%2)\n\t"
+            "fistpl (%0)\n\t"
+            : : "r" (a0), "m" (b0), "r" (a1), "m" (b1),
+                "r" (dmod), "r" (dinvmod),
+                "m" (MPD_TWO63)
+            : "st", "memory"
+    );
+}
+/* END PPRO GCC ASM */
+#elif defined(MASM)
+
+/* Return (a * b) % dmod */
+static inline mpd_uint_t __cdecl
+ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
+{
+    mpd_uint_t retval;
+
+    __asm {
+        mov     eax, dinvmod
+        mov     edx, dmod
+        fild    b
+        fild    a
+        fmulp   st(1), st
+        fld     TBYTE PTR [eax]
+        fmul    st, st(1)
+        fld     MPD_TWO63
+        fadd    st(1), st
+        fsubp   st(1), st
+        fld     QWORD PTR [edx]
+        fmulp   st(1), st
+        fsubp   st(1), st
+        fistp   retval
+    }
+
+    return retval;
+}
+
+/*
+ * Two modular multiplications in parallel:
+ *      *a0 = (*a0 * w) % dmod
+ *      *a1 = (*a1 * w) % dmod
+ */
+static inline mpd_uint_t __cdecl
+ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
+              double *dmod, uint32_t *dinvmod)
+{
+    __asm {
+        mov     ecx, dmod
+        mov     edx, a1
+        mov     ebx, dinvmod
+        mov     eax, a0
+        fild    w
+        fild    DWORD PTR [edx]
+        fmul    st, st(1)
+        fxch    st(1)
+        fild    DWORD PTR [eax]
+        fmulp   st(1), st
+        fld     TBYTE PTR [ebx]
+        fld     MPD_TWO63
+        fld     st(2)
+        fmul    st, st(2)
+        fadd    st, st(1)
+        fsub    st, st(1)
+        fmul    QWORD PTR [ecx]
+        fsubp   st(3), st
+        fxch    st(2)
+        fistp   DWORD PTR [eax]
+        fmul    st, st(2)
+        fadd    st, st(1)
+        fsubrp  st(1), st
+        fmul    QWORD PTR [ecx]
+        fsubp   st(1), st
+        fistp   DWORD PTR [edx]
+    }
+}
+
+/*
+ * Two modular multiplications in parallel:
+ *      *a0 = (*a0 * b0) % dmod
+ *      *a1 = (*a1 * b1) % dmod
+ */
+static inline void __cdecl
+ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
+             double *dmod, uint32_t *dinvmod)
+{
+    __asm {
+        mov     ecx, dmod
+        mov     edx, a1
+        mov     ebx, dinvmod
+        mov     eax, a0
+        fild    b1
+        fild    DWORD PTR [edx]
+        fmulp   st(1), st
+        fild    b0
+        fild    DWORD PTR [eax]
+        fmulp   st(1), st
+        fld     TBYTE PTR [ebx]
+        fld     st(2)
+        fmul    st, st(1)
+        fxch    st(1)
+        fmul    st, st(2)
+        fld     DWORD PTR MPD_TWO63
+        fld     QWORD PTR [ecx]
+        fxch    st(3)
+        fadd    st, st(1)
+        fxch    st(2)
+        fadd    st, st(1)
+        fxch    st(2)
+        fsub    st, st(1)
+        fxch    st(2)
+        fsubrp  st(1), st
+        fxch    st(1)
+        fmul    st, st(2)
+        fxch    st(1)
+        fmulp   st(2), st
+        fsubp   st(3), st
+        fsubp   st(1), st
+        fxch    st(1)
+        fistp   DWORD PTR [edx]
+        fistp   DWORD PTR [eax]
+    }
+}
+#endif /* PPRO MASM (_MSC_VER) */
+
+
+/* Return (base ** exp) % dmod */
+static inline mpd_uint_t
+ppro_powmod(mpd_uint_t base, mpd_uint_t exp, double *dmod, uint32_t *dinvmod)
+{
+    mpd_uint_t r = 1;
+
+    while (exp > 0) {
+        if (exp & 1)
+            r = ppro_mulmod(r, base, dmod, dinvmod);
+        base = ppro_mulmod(base, base, dmod, dinvmod);
+        exp >>= 1;
+    }
+
+    return r;
+}
+#endif /* PPRO */
+#endif /* CONFIG_32 */
+
+
+#endif /* UMODARITH_H */
+
+
+