diff options
author | Marco Bodrato <bodrato@mail.dm.unipi.it> | 2015-08-06 04:14:32 +0200 |
---|---|---|
committer | Marco Bodrato <bodrato@mail.dm.unipi.it> | 2015-08-06 04:14:32 +0200 |
commit | 2ea7d2279f7c6d77b1d4ccee427a75c8f61a132e (patch) | |
tree | 5f7029c4d926a02062b80e02c18151848db78159 | |
parent | 1ecd3e90d57e06293843327cae934245afcc873b (diff) | |
download | gmp-2ea7d2279f7c6d77b1d4ccee427a75c8f61a132e.tar.gz |
mpn/generic/sqrlo{,_basecase.c}: New files.
-rw-r--r-- | configure.ac | 2 | ||||
-rw-r--r-- | gmp-impl.h | 6 | ||||
-rw-r--r-- | mpn/generic/sqrlo.c | 246 | ||||
-rw-r--r-- | mpn/generic/sqrlo_basecase.c | 108 |
4 files changed, 361 insertions, 1 deletions
diff --git a/configure.ac b/configure.ac index 7c8b3682c..788542ca7 100644 --- a/configure.ac +++ b/configure.ac @@ -2860,7 +2860,7 @@ gmp_mpn_functions="$extra_functions \ matrix22_mul matrix22_mul1_inverse_vector \ hgcd_matrix hgcd2 hgcd_step hgcd_reduce hgcd hgcd_appr \ hgcd2_jacobi hgcd_jacobi \ - mullo_n mullo_basecase \ + mullo_n mullo_basecase sqrlo sqrlo_basecase \ toom22_mul toom32_mul toom42_mul toom52_mul toom62_mul \ toom33_mul toom43_mul toom53_mul toom54_mul toom63_mul \ toom44_mul \ diff --git a/gmp-impl.h b/gmp-impl.h index b5f8b05fb..7f807337a 100644 --- a/gmp-impl.h +++ b/gmp-impl.h @@ -1084,6 +1084,12 @@ __GMP_DECLSPEC void mpn_mullo_basecase (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t) __GMP_DECLSPEC void mpn_sqr_basecase (mp_ptr, mp_srcptr, mp_size_t); #endif +#define mpn_sqrlo __MPN(sqrlo) +__GMP_DECLSPEC void mpn_sqrlo (mp_ptr, mp_srcptr, mp_size_t); + +#define mpn_sqrlo_basecase __MPN(sqrlo_basecase) +__GMP_DECLSPEC void mpn_sqrlo_basecase (mp_ptr, mp_srcptr, mp_size_t); + #define mpn_mulmid_basecase __MPN(mulmid_basecase) __GMP_DECLSPEC void mpn_mulmid_basecase (mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t); diff --git a/mpn/generic/sqrlo.c b/mpn/generic/sqrlo.c new file mode 100644 index 000000000..ffc2ef722 --- /dev/null +++ b/mpn/generic/sqrlo.c @@ -0,0 +1,246 @@ +/* mpn_sqrlo -- squares an n-limb number and returns the low n limbs + of the result. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS + FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#ifndef SQRLO_BASECASE_THRESHOLD_LIMIT +#define SQRLO_BASECASE_THRESHOLD_LIMIT 200 +#endif +#ifndef SQRLO_BASECASE_THRESHOLD +#define SQRLO_BASECASE_THRESHOLD 0 +#endif +#ifndef SQRLO_DC_THRESHOLD +#define SQRLO_DC_THRESHOLD (2*SQR_TOOM2_THRESHOLD) +#endif +#ifndef SQRLO_SQR_THRESHOLD +#define SQRLO_SQR_THRESHOLD (2*SQR_FFT_THRESHOLD) +#endif + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_range_basecase 1 +#define MAYBE_range_toom22 1 +#else +#define MAYBE_range_basecase \ + ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11)) +#define MAYBE_range_toom22 \ + ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) ) +#endif + +/* THINK: The DC strategy uses different constants in different Toom's + ranges. Something smoother? +*/ + +/* + Compute the least significant half of the product {xy,n}*{yp,n}, or + formally {rp,n} = {xy,n}*{yp,n} Mod (B^n). + + Above the given threshold, the Divide and Conquer strategy is used. + The operand is split in two, and a full square plus a mullo + is used to obtain the final result. The more natural strategy is to + split in two halves, but this is far from optimal when a + sub-quadratic multiplication is used. + + Mulders suggests an unbalanced split in favour of the full product, + split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2. + + To compute the value of a, we assume that the cost of mullo for a + given size ML(n) is a fraction of the cost of a full product with + same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2; + then we can write: + + ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e + + Given a value for e, want to minimise the value of k, i.e. the + function k=(1-a)^e/(1-2*a^e). + + With e=2, the exponent for schoolbook multiplication, the minimum is + given by the values a=1-a=1/2. + + With e=log(3)/log(2), the exponent for Karatsuba (aka toom22), + Mulders compute (1-a) = 0.694... and we approximate a with 11/36. + + Other possible approximations follow: + e=log(5)/log(3) [Toom-3] -> a ~= 9/40 + e=log(7)/log(4) [Toom-4] -> a ~= 7/39 + e=log(11)/log(6) [Toom-6] -> a ~= 1/8 + e=log(15)/log(8) [Toom-8] -> a ~= 1/10 + + The values above where obtained with the following trivial commands + in the gp-pari shell: + +fun(e,a)=(1-a)^e/(1-2*a^e) +mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))} +contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5)) +contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5)) +contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5)) +contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3)) +contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3)) + + , + |\ + | \ + +----, + | | + | | + | |\ + | | \ + +----+--` + ^ n2 ^n1^ + + For an actual implementation, the assumption that M(n)=n^e is + incorrect, as a consequence also the assumption that ML(n)=k*M(n) + with a constant k is wrong. + + But theory suggest us two things: + - the best the multiplication product is (lower e), the more k + approaches 1, and a approaches 0. + + - A value for a smaller than optimal is probably less bad than a + bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal + value, and k(a)=0.808_ the mul/mullo speed ratio. We get + k(a+1/6)=0.929_ but k(a-1/6)=0.865_. +*/ + +static mp_size_t +mpn_sqrlo_itch (mp_size_t n) +{ + return 2*n; +} + +/* + mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp. + It accepts tp == rp. +*/ +static void +mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp) +{ + mp_size_t n2, n1; + ASSERT (n >= 2); + ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); + ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n)); + + /* Divide-and-conquer */ + + /* We need fractional approximation of the value 0 < a <= 1/2 + giving the minimum in the function k=(1-a)^e/(1-2*a^e). + */ + if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11))) + n1 = n >> 1; + else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11))) + n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */ + else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9))) + n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */ + else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9)) + n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */ + /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */ + else + n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */ + + n2 = n - n1; + + /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */ + + /* x0 ^ 2 */ + mpn_sqr (tp, xp, n2); + MPN_COPY (rp, tp, n2); + + /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */ + if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD)) + mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1); + else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD)) + mpn_mullo_basecase (tp + n, xp + n2, xp, n1); + else + mpn_mullo_n (tp + n, xp + n2, xp, n1); + /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */ +#if HAVE_NATIVE_mpn_addlsh1_n + mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1); +#else + mpn_lshift (rp + n2, tp + n, n1, 1); + mpn_add_n (rp + n2, rp + n2, tp + n2, n1); +#endif +} + +/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */ +#define SQR_BASECASE_ALLOC \ + (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT) + +/* FIXME: This function should accept a temporary area; dc_sqrlo + accepts a pointer tp, and handle the case tp == rp, do the same here. +*/ + +void +mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n) +{ + ASSERT (n >= 1); + ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); + + if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD)) + { + /* Allocate workspace of fixed size on stack: fast! */ + mp_limb_t tp[SQR_BASECASE_ALLOC]; + mpn_sqr_basecase (tp, xp, n); + MPN_COPY (rp, tp, n); + } + else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD)) + { + mpn_sqrlo_basecase (rp, xp, n); + } + else + { + mp_ptr tp; + TMP_DECL; + TMP_MARK; + tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n)); + if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD)) + { + mpn_dc_sqrlo (rp, xp, n, tp); + } + else + { + /* For really large operands, use plain mpn_mul_n but throw away upper n + limbs of result. */ +#if !TUNE_PROGRAM_BUILD && (SQRLO_MUL_N_THRESHOLD > SQR_FFT_THRESHOLD) + mpn_fft_mul (tp, xp, n, xp, n); +#else + mpn_sqr (tp, xp, n); +#endif + MPN_COPY (rp, tp, n); + } + TMP_FREE; + } +} diff --git a/mpn/generic/sqrlo_basecase.c b/mpn/generic/sqrlo_basecase.c new file mode 100644 index 000000000..a52744158 --- /dev/null +++ b/mpn/generic/sqrlo_basecase.c @@ -0,0 +1,108 @@ +/* mpn_sqrlo_basecase -- Internal routine to square a natural number + of length n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011, 2015 +Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +#if HAVE_NATIVE_mpn_sqr_diagonal +#define MPN_SQR_DIAGONAL(rp, up, n) \ + mpn_sqr_diagonal (rp, up, n) +#else +#define MPN_SQR_DIAGONAL(rp, up, n) \ + do { \ + mp_size_t _i; \ + for (_i = 0; _i < (n); _i++) \ + { \ + mp_limb_t ul, lpl; \ + ul = (up)[_i]; \ + umul_ppmm ((rp)[2 * _i + 1], lpl, ul, ul << GMP_NAIL_BITS); \ + (rp)[2 * _i] = lpl >> GMP_NAIL_BITS; \ + } \ + } while (0) +#endif + +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 +#define MPN_SQRLO_DIAG_ADDLSH1(rp, tp, up, n) \ + do { \ + MPN_SQR_DIAGONAL (rp, up, n>>1); \ + if ((n & 1) != 0) \ + (rp)[n - 1] = ((up)[n>>1] * (up)[n>>1]) & GMP_NUMB_MASK; \ + mpn_addlsh1_n_ip1 (rp + 1, tp, n - 1); \ + } while (0) +#else +#define MPN_SQRLO_DIAG_ADDLSH1(rp, tp, up, n) \ + do { \ + MPN_SQR_DIAGONAL (rp, up, n>>1); \ + if ((n & 1) != 0) \ + (rp)[n - 1] = ((up)[n>>1] * (up)[n>>1]) & GMP_NUMB_MASK; \ + mpn_lshift (tp, tp, n - 1, 1); \ + mpn_add_n (rp + 1, rp + 1, tp, n - 1); \ + } while (0) +#endif + + +/* Default mpn_sqrlo_basecase using mpn_addmul_1. */ + +void +mpn_sqrlo_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_size_t i; + + ASSERT (n >= 1); + ASSERT (! MPN_OVERLAP_P (rp, n, up, n)); + + if (n <= 1) + { + mp_limb_t ul; + ul = up[0]; + rp[0] = (ul * ul) & GMP_NUMB_MASK; + } + else + { + mp_limb_t tp[2 * SQR_TOOM2_THRESHOLD - 1]; + + /* must fit n-1 limbs in tp */ + ASSERT (n <= 2 * SQR_TOOM2_THRESHOLD); + + mpn_mul_1 (tp, up + 1, n - 1, up[0]); + for (i = 2; 2 * i - 1 < n; ++i) + mpn_addmul_1 (tp + 2 * i - 2, up + i, n - 2 * i + 1, up[i - 1]); + + MPN_SQRLO_DIAG_ADDLSH1 (rp, tp, up, n); + } +} |