mpn/generic/sqrlo{,_basecase.c}: New files.

author: Marco Bodrato <bodrato@mail.dm.unipi.it> 2015-08-06 04:14:32 +0200
committer: Marco Bodrato <bodrato@mail.dm.unipi.it> 2015-08-06 04:14:32 +0200
commit: 2ea7d2279f7c6d77b1d4ccee427a75c8f61a132e (patch)
tree: 5f7029c4d926a02062b80e02c18151848db78159
parent: 1ecd3e90d57e06293843327cae934245afcc873b (diff)
download: gmp-2ea7d2279f7c6d77b1d4ccee427a75c8f61a132e.tar.gz
4 files changed, 361 insertions, 1 deletions
diff --git a/configure.ac b/configure.ac
index 7c8b3682c..788542ca7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2860,7 +2860,7 @@ gmp_mpn_functions="$extra_functions					   \
   matrix22_mul matrix22_mul1_inverse_vector				   \
   hgcd_matrix hgcd2 hgcd_step hgcd_reduce hgcd hgcd_appr		   \
   hgcd2_jacobi hgcd_jacobi						   \
-  mullo_n mullo_basecase						   \
+  mullo_n mullo_basecase sqrlo sqrlo_basecase				   \
   toom22_mul toom32_mul toom42_mul toom52_mul toom62_mul		   \
   toom33_mul toom43_mul toom53_mul toom54_mul toom63_mul		   \
   toom44_mul								   \
diff --git a/gmp-impl.h b/gmp-impl.h
index b5f8b05fb..7f807337a 100644
--- a/gmp-impl.h
+++ b/gmp-impl.h
@@ -1084,6 +1084,12 @@ __GMP_DECLSPEC void mpn_mullo_basecase (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)
 __GMP_DECLSPEC void mpn_sqr_basecase (mp_ptr, mp_srcptr, mp_size_t);
 #endif
 
+#define mpn_sqrlo __MPN(sqrlo)
+__GMP_DECLSPEC void mpn_sqrlo (mp_ptr, mp_srcptr, mp_size_t);
+
+#define mpn_sqrlo_basecase __MPN(sqrlo_basecase)
+__GMP_DECLSPEC void mpn_sqrlo_basecase (mp_ptr, mp_srcptr, mp_size_t);
+
 #define mpn_mulmid_basecase __MPN(mulmid_basecase)
 __GMP_DECLSPEC void mpn_mulmid_basecase (mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t);
 
diff --git a/mpn/generic/sqrlo.c b/mpn/generic/sqrlo.c
new file mode 100644
index 000000000..ffc2ef722
--- /dev/null
+++ b/mpn/generic/sqrlo.c
@@ -0,0 +1,246 @@
+/* mpn_sqrlo -- squares an n-limb number and returns the low n limbs
+   of the result.
+
+   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+   THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
+   FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
+   THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifndef SQRLO_BASECASE_THRESHOLD_LIMIT
+#define SQRLO_BASECASE_THRESHOLD_LIMIT	200
+#endif
+#ifndef SQRLO_BASECASE_THRESHOLD
+#define SQRLO_BASECASE_THRESHOLD	0
+#endif
+#ifndef SQRLO_DC_THRESHOLD
+#define SQRLO_DC_THRESHOLD		(2*SQR_TOOM2_THRESHOLD)
+#endif
+#ifndef SQRLO_SQR_THRESHOLD
+#define SQRLO_SQR_THRESHOLD		(2*SQR_FFT_THRESHOLD)
+#endif
+
+#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
+#define MAYBE_range_basecase 1
+#define MAYBE_range_toom22   1
+#else
+#define MAYBE_range_basecase                                           \
+  ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11))
+#define MAYBE_range_toom22                                             \
+  ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) )
+#endif
+
+/*  THINK: The DC strategy uses different constants in different Toom's
+	 ranges. Something smoother?
+*/
+
+/*
+  Compute the least significant half of the product {xy,n}*{yp,n}, or
+  formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
+
+  Above the given threshold, the Divide and Conquer strategy is used.
+  The operand is split in two, and a full square plus a mullo
+  is used to obtain the final result. The more natural strategy is to
+  split in two halves, but this is far from optimal when a
+  sub-quadratic multiplication is used.
+
+  Mulders suggests an unbalanced split in favour of the full product,
+  split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
+
+  To compute the value of a, we assume that the cost of mullo for a
+  given size ML(n) is a fraction of the cost of a full product with
+  same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
+  then we can write:
+
+  ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
+
+  Given a value for e, want to minimise the value of k, i.e. the
+  function k=(1-a)^e/(1-2*a^e).
+
+  With e=2, the exponent for schoolbook multiplication, the minimum is
+  given by the values a=1-a=1/2.
+
+  With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
+  Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
+
+  Other possible approximations follow:
+  e=log(5)/log(3) [Toom-3] -> a ~= 9/40
+  e=log(7)/log(4) [Toom-4] -> a ~= 7/39
+  e=log(11)/log(6) [Toom-6] -> a ~= 1/8
+  e=log(15)/log(8) [Toom-8] -> a ~= 1/10
+
+  The values above where obtained with the following trivial commands
+  in the gp-pari shell:
+
+fun(e,a)=(1-a)^e/(1-2*a^e)
+mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
+contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
+contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
+contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
+contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
+contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
+
+  ,
+  |\
+  | \
+  +----,
+  |    |
+  |    |
+  |    |\
+  |    | \
+  +----+--`
+  ^ n2 ^n1^
+
+  For an actual implementation, the assumption that M(n)=n^e is
+  incorrect, as a consequence also the assumption that ML(n)=k*M(n)
+  with a constant k is wrong.
+
+  But theory suggest us two things:
+  - the best the multiplication product is (lower e), the more k
+    approaches 1, and a approaches 0.
+
+  - A value for a smaller than optimal is probably less bad than a
+    bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
+    value, and k(a)=0.808_ the mul/mullo speed ratio. We get
+    k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
+*/
+
+static mp_size_t
+mpn_sqrlo_itch (mp_size_t n)
+{
+  return 2*n;
+}
+
+/*
+    mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp.
+    It accepts tp == rp.
+*/
+static void
+mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
+{
+  mp_size_t n2, n1;
+  ASSERT (n >= 2);
+  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
+  ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
+
+  /* Divide-and-conquer */
+
+  /* We need fractional approximation of the value 0 < a <= 1/2
+     giving the minimum in the function k=(1-a)^e/(1-2*a^e).
+  */
+  if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
+    n1 = n >> 1;
+  else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
+    n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
+  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9)))
+    n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
+  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9))
+    n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
+  /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
+  else
+    n1 = n / (size_t) 10;		/* n1 ~= n*(1-.899...) [TOOM88] */
+
+  n2 = n - n1;
+
+  /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */
+
+  /* x0 ^ 2 */
+  mpn_sqr (tp, xp, n2);
+  MPN_COPY (rp, tp, n2);
+
+  /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */
+  if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
+    mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1);
+  else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
+    mpn_mullo_basecase (tp + n, xp + n2, xp, n1);
+  else
+    mpn_mullo_n (tp + n, xp + n2, xp, n1);
+  /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1);
+#else
+  mpn_lshift (rp + n2, tp + n, n1, 1);
+  mpn_add_n (rp + n2, rp + n2, tp + n2, n1);
+#endif
+}
+
+/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
+#define SQR_BASECASE_ALLOC \
+ (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT)
+
+/* FIXME: This function should accept a temporary area; dc_sqrlo
+   accepts a pointer tp, and handle the case tp == rp, do the same here.
+*/
+
+void
+mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n)
+{
+  ASSERT (n >= 1);
+  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
+
+  if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD))
+    {
+      /* Allocate workspace of fixed size on stack: fast! */
+      mp_limb_t tp[SQR_BASECASE_ALLOC];
+      mpn_sqr_basecase (tp, xp, n);
+      MPN_COPY (rp, tp, n);
+    }
+  else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD))
+    {
+      mpn_sqrlo_basecase (rp, xp, n);
+    }
+  else
+    {
+      mp_ptr tp;
+      TMP_DECL;
+      TMP_MARK;
+      tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n));
+      if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD))
+	{
+	  mpn_dc_sqrlo (rp, xp, n, tp);
+	}
+      else
+	{
+	  /* For really large operands, use plain mpn_mul_n but throw away upper n
+	     limbs of result.  */
+#if !TUNE_PROGRAM_BUILD && (SQRLO_MUL_N_THRESHOLD > SQR_FFT_THRESHOLD)
+	  mpn_fft_mul (tp, xp, n, xp, n);
+#else
+	  mpn_sqr (tp, xp, n);
+#endif
+	  MPN_COPY (rp, tp, n);
+	}
+      TMP_FREE;
+    }
+}
diff --git a/mpn/generic/sqrlo_basecase.c b/mpn/generic/sqrlo_basecase.c
new file mode 100644
index 000000000..a52744158
--- /dev/null
+++ b/mpn/generic/sqrlo_basecase.c
@@ -0,0 +1,108 @@
+/* mpn_sqrlo_basecase -- Internal routine to square a natural number
+   of length n.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011, 2015
+Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+#if HAVE_NATIVE_mpn_sqr_diagonal
+#define MPN_SQR_DIAGONAL(rp, up, n)					\
+  mpn_sqr_diagonal (rp, up, n)
+#else
+#define MPN_SQR_DIAGONAL(rp, up, n)					\
+  do {									\
+    mp_size_t _i;							\
+    for (_i = 0; _i < (n); _i++)					\
+      {									\
+	mp_limb_t ul, lpl;						\
+	ul = (up)[_i];							\
+	umul_ppmm ((rp)[2 * _i + 1], lpl, ul, ul << GMP_NAIL_BITS);	\
+	(rp)[2 * _i] = lpl >> GMP_NAIL_BITS;				\
+      }									\
+  } while (0)
+#endif
+
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1
+#define MPN_SQRLO_DIAG_ADDLSH1(rp, tp, up, n)				\
+  do {									\
+    MPN_SQR_DIAGONAL (rp, up, n>>1);					\
+    if ((n & 1) != 0)							\
+      (rp)[n - 1] = ((up)[n>>1] * (up)[n>>1]) & GMP_NUMB_MASK;		\
+    mpn_addlsh1_n_ip1 (rp + 1, tp, n - 1);				\
+  } while (0)
+#else
+#define MPN_SQRLO_DIAG_ADDLSH1(rp, tp, up, n)				\
+  do {									\
+    MPN_SQR_DIAGONAL (rp, up, n>>1);					\
+    if ((n & 1) != 0)							\
+      (rp)[n - 1] = ((up)[n>>1] * (up)[n>>1]) & GMP_NUMB_MASK;		\
+    mpn_lshift (tp, tp, n - 1, 1);					\
+    mpn_add_n (rp + 1, rp + 1, tp, n - 1);				\
+  } while (0)
+#endif
+
+
+/* Default mpn_sqrlo_basecase using mpn_addmul_1.  */
+
+void
+mpn_sqrlo_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
+{
+  mp_size_t i;
+
+  ASSERT (n >= 1);
+  ASSERT (! MPN_OVERLAP_P (rp, n, up, n));
+
+  if (n <= 1)
+    {
+      mp_limb_t ul;
+      ul = up[0];
+      rp[0] = (ul * ul) & GMP_NUMB_MASK;
+    }
+  else
+    {
+      mp_limb_t tp[2 * SQR_TOOM2_THRESHOLD - 1];
+
+      /* must fit n-1 limbs in tp */
+      ASSERT (n <= 2 * SQR_TOOM2_THRESHOLD);
+
+      mpn_mul_1 (tp, up + 1, n - 1, up[0]);
+      for (i = 2; 2 * i - 1 < n; ++i)
+	mpn_addmul_1 (tp + 2 * i - 2, up + i, n - 2 * i + 1, up[i - 1]);
+
+      MPN_SQRLO_DIAG_ADDLSH1 (rp, tp, up, n);
+    }
+}
author	Marco Bodrato <bodrato@mail.dm.unipi.it>	2015-08-06 04:14:32 +0200
committer	Marco Bodrato <bodrato@mail.dm.unipi.it>	2015-08-06 04:14:32 +0200
commit	2ea7d2279f7c6d77b1d4ccee427a75c8f61a132e (patch)
tree	5f7029c4d926a02062b80e02c18151848db78159
parent	1ecd3e90d57e06293843327cae934245afcc873b (diff)
download	gmp-2ea7d2279f7c6d77b1d4ccee427a75c8f61a132e.tar.gz