summaryrefslogtreecommitdiff
path: root/gen-psqr.c
diff options
context:
space:
mode:
authorKevin Ryde <user42@zip.com.au>2002-10-03 02:15:05 +0200
committerKevin Ryde <user42@zip.com.au>2002-10-03 02:15:05 +0200
commit0e418e12617bd40f60e67f94e87c13fe887bbf22 (patch)
tree9240d436dd9938467ba057755cf2cef648a2596b /gen-psqr.c
parent4879c014c18f8865643a15e96534eedf5cd6255d (diff)
downloadgmp-0e418e12617bd40f60e67f94e87c13fe887bbf22.tar.gz
* gen-psqr.c: New file.
Diffstat (limited to 'gen-psqr.c')
-rw-r--r--gen-psqr.c550
1 files changed, 550 insertions, 0 deletions
diff --git a/gen-psqr.c b/gen-psqr.c
new file mode 100644
index 000000000..ced075074
--- /dev/null
+++ b/gen-psqr.c
@@ -0,0 +1,550 @@
+/* Generate perfect square testing data.
+
+Copyright 2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "dumbmp.c"
+
+
+/* The aim of this program is to choose either mpn_mod_34lsub1 or mpn_mod_1
+ (plus a PERFSQR_PP modulus), and generate tables indicating residues and
+ non-residues modulo small factors of that modulus.
+
+ For the usual 32 or 64 bit cases mpn_mod_34lsub1 gets used. That
+ function exists specifically because 2^24-1 and 2^48-1 have nice sets of
+ prime factors. For other limb sizes it's considered, but if it doesn't
+ have good factors then mpn_mod_1 will be used instead.
+
+ When mpn_mod_1 is used, the modulus PERFSQR_PP is created from a
+ selection of small primes, chosen to fill PERFSQR_MOD_BITS of a limb,
+ with that bit count chosen so (2*GMP_LIMB_BITS)*2^PERFSQR_MOD_BITS <=
+ GMP_LIMB_MAX, allowing PERFSQR_MOD_IDX in mpn/generic/perfsqr.c to do its
+ calculation within a single limb.
+
+ In either case primes can be combined to make divisors. The table data
+ then effectively indicates remainders which are quadratic residues mod
+ all the primes. This sort of combining reduces the number of steps
+ needed after mpn_mod_34lsub1 or mpn_mod_1, saving code size and time.
+ Nothing is gained or lost in terms of detections, the same total fraction
+ of non-residues will be identified.
+
+ Nothing particularly sophisticated is attempted for combining factors to
+ make divisors. This is probably a kind of knapsack problem so it'd be
+ too hard to attempt anything completely general. For the usual 32 and 64
+ bit limbs we get a good enough result just pairing the biggest and
+ smallest which fit together, repeatedly.
+
+ Another aim is to get powerful combinations, ie. divisors which identify
+ biggest fraction of non-residues, and have those run first. Again for
+ the usual 32 and 64 bits it seems good enough just to pair for big
+ divisors then sort according to the resulting fraction of non-residues
+ identified.
+
+ Also in this program, a table sq_res_0x100 of residues modulo 256 is
+ generated. This simply fills bits into limbs of the appropriate
+ build-time GMP_LIMB_BITS each.
+
+*/
+
+mpz_t *sq_res_0x100; /* table of limbs */
+int nsq_res_0x100; /* elements in sq_res_0x100 array */
+int sq_res_0x100_num; /* squares in sq_res_0x100 */
+double sq_res_0x100_fraction; /* sq_res_0x100_num / 256 */
+
+int mod34_bits; /* 3*GMP_NUMB_BITS/4 */
+int mod_bits; /* bits from PERFSQR_MOD_34 or MOD_PP */
+int max_divisor; /* all divisors <= max_divisor */
+int max_divisor_bits; /* ceil(log2(max_divisor)) */
+double total_fraction; /* of squares */
+mpz_t pp; /* product of primes, or 0 if mod_34lsub1 used */
+mpz_t pp_norm; /* pp shifted so NUMB high bit set */
+mpz_t pp_inverted; /* invert_limb style inverse */
+mpz_t mod_mask; /* 2^mod_bits-1 */
+char mod34_excuse[128]; /* why mod_34lsub1 not used (if it's not) */
+
+/* raw list of divisors of 2^mod34_bits-1 or pp, just to show in a comment */
+struct rawfactor_t {
+ int divisor;
+ int multiplicity;
+};
+struct rawfactor_t *rawfactor;
+int nrawfactor;
+
+/* factors of 2^mod34_bits-1 or pp and associated data, after combining etc */
+struct factor_t {
+ int divisor;
+ mpz_t inverse; /* 1/divisor mod 2^mod_bits */
+ mpz_t mask; /* indicating squares mod divisor */
+ double fraction; /* squares/total */
+};
+struct factor_t *factor;
+int nfactor; /* entries in use in factor array */
+int factor_alloc; /* entries allocated to factor array */
+
+
+int
+f_cmp_divisor (struct factor_t *p, struct factor_t *q)
+{
+ if (p->divisor > q->divisor)
+ return 1;
+ else if (p->divisor < q->divisor)
+ return -1;
+ else
+ return 0;
+}
+
+int
+f_cmp_fraction (struct factor_t *p, struct factor_t *q)
+{
+ if (p->fraction > q->fraction)
+ return 1;
+ else if (p->fraction < q->fraction)
+ return -1;
+ else
+ return 0;
+}
+
+/* Remove array[idx] by copying the remainder down, and adjust narray
+ accordingly. */
+#define COLLAPSE_ELEMENT(array, idx, narray) \
+ do { \
+ mem_copyi ((char *) &(array)[idx], \
+ (char *) &(array)[idx+1], \
+ ((narray)-((idx)+1)) * sizeof (array[0])); \
+ (narray)--; \
+ } while (0)
+
+
+/* return n*2^p mod m */
+int
+mul_2exp_mod (int n, int p, int m)
+{
+ int i;
+ for (i = 0; i < p; i++)
+ n = (2 * n) % m;
+ return n;
+}
+
+/* return -n mod m */
+int
+neg_mod (int n, int m)
+{
+ ASSERT (n >= 0 && n < m);
+ return (n == 0 ? 0 : m-n);
+}
+
+/* Set "mask" to a value such that "mask & (1<<idx)" is non-zero if
+ "-(idx<<mod_bits)" can be a square modulo m. */
+void
+square_mask (mpz_t mask, int m)
+{
+ int p, i, r, idx;
+
+ p = mul_2exp_mod (1, mod_bits, m);
+ p = neg_mod (p, m);
+
+ mpz_set_ui (mask, 0L);
+ for (i = 0; i < m; i++)
+ {
+ r = (i * i) % m;
+ idx = (r * p) % m;
+ mpz_setbit (mask, (unsigned long) idx);
+ }
+}
+
+void
+generate_sq_res_0x100 (int limb_bits)
+{
+ int i, res;
+
+ nsq_res_0x100 = (0x100 + limb_bits - 1) / limb_bits;
+ sq_res_0x100 = (mpz_t *) xmalloc (nsq_res_0x100 * sizeof (*sq_res_0x100));
+
+ for (i = 0; i < nsq_res_0x100; i++)
+ mpz_init_set_ui (sq_res_0x100[i], 0L);
+
+ for (i = 0; i < 0x100; i++)
+ {
+ res = (i * i) % 0x100;
+ mpz_setbit (sq_res_0x100[res / limb_bits],
+ (unsigned long) (res % limb_bits));
+ }
+
+ sq_res_0x100_num = 0;
+ for (i = 0; i < nsq_res_0x100; i++)
+ sq_res_0x100_num += mpz_popcount (sq_res_0x100[i]);
+ sq_res_0x100_fraction = (double) sq_res_0x100_num / 256.0;
+}
+
+void
+generate_mod (int limb_bits, int nail_bits)
+{
+ int numb_bits = limb_bits - nail_bits;
+ int i, divisor;
+
+ mpz_init_set_ui (pp, 0L);
+ mpz_init_set_ui (pp_norm, 0L);
+ mpz_init_set_ui (pp_inverted, 0L);
+
+ /* no more than limb_bits many factors in a one limb modulus (and of
+ course in reality nothing like that many) */
+ factor_alloc = limb_bits;
+ factor = (struct factor_t *) xmalloc (factor_alloc * sizeof (*factor));
+ rawfactor = (struct rawfactor_t *)
+ xmalloc (factor_alloc * sizeof (*rawfactor));
+
+ if (numb_bits % 4 != 0)
+ {
+ strcpy (mod34_excuse, "GMP_NUMB_BITS % 4 != 0");
+ goto use_pp;
+ }
+
+ max_divisor = 2*limb_bits;
+ max_divisor_bits = log2_ceil (max_divisor);
+
+ if (numb_bits / 4 < max_divisor_bits)
+ {
+ /* Wind back to one limb worth of max_divisor, if that will let us use
+ mpn_mod_34lsub1. */
+ max_divisor = limb_bits;
+ max_divisor_bits = log2_ceil (max_divisor);
+
+ if (numb_bits / 4 < max_divisor_bits)
+ {
+ strcpy (mod34_excuse, "GMP_NUMB_BITS / 4 too small");
+ goto use_pp;
+ }
+ }
+
+ {
+ /* Can use mpn_mod_34lsub1, find small factors of 2^mod34_bits-1. */
+ mpz_t m, q, r;
+ int multiplicity;
+
+ mod34_bits = (numb_bits / 4) * 3;
+
+ /* mpn_mod_34lsub1 returns a full limb value, PERFSQR_MOD_34 folds it at
+ the mod34_bits mark, adding the two halves for a remainder of at most
+ mod34_bits+1 many bits */
+ mod_bits = mod34_bits + 1;
+
+ mpz_init_set_ui (m, 1L);
+ mpz_mul_2exp (m, m, mod34_bits);
+ mpz_sub_ui (m, m, 1L);
+
+ mpz_init (q);
+ mpz_init (r);
+
+ for (i = 3; i <= max_divisor; i++)
+ {
+ if (! isprime (i))
+ continue;
+
+ mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
+ if (mpz_sgn (r) != 0)
+ continue;
+
+ /* if a repeated prime is found it's used as an i^n in one factor */
+ divisor = 1;
+ multiplicity = 0;
+ do
+ {
+ if (divisor > max_divisor / i)
+ break;
+ multiplicity++;
+ mpz_set (m, q);
+ mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
+ }
+ while (mpz_sgn (r) == 0);
+
+ ASSERT (nrawfactor < factor_alloc);
+ rawfactor[nrawfactor].divisor = i;
+ rawfactor[nrawfactor].multiplicity = multiplicity;
+ nrawfactor++;
+ }
+
+ mpz_clear (m);
+ mpz_clear (q);
+ mpz_clear (r);
+ }
+
+ if (nrawfactor <= 2)
+ {
+ mpz_t new_pp;
+
+ sprintf (mod34_excuse, "only %d small factor%s",
+ nrawfactor, nrawfactor == 1 ? "" : "s");
+
+ use_pp:
+ /* reset to two limbs of max_divisor, in case the mpn_mod_34lsub1 code
+ tried with just one */
+ max_divisor = 2*limb_bits;
+ max_divisor_bits = log2_ceil (max_divisor);
+
+ mpz_init (new_pp);
+ nrawfactor = 0;
+ mod_bits = MIN (numb_bits, limb_bits - max_divisor_bits);
+
+ /* one copy of each small prime */
+ mpz_set_ui (pp, 1L);
+ for (i = 3; i <= max_divisor; i++)
+ {
+ if (! isprime (i))
+ continue;
+
+ mpz_mul_ui (new_pp, pp, (unsigned long) i);
+ if (mpz_sizeinbase (new_pp, 2) > mod_bits)
+ break;
+ mpz_set (pp, new_pp);
+
+ ASSERT (nrawfactor < factor_alloc);
+ rawfactor[nrawfactor].divisor = i;
+ rawfactor[nrawfactor].multiplicity = 1;
+ nrawfactor++;
+ }
+
+ /* Plus an extra copy of one or more of the primes selected, if that
+ still fits in max_divisor and the total in mod_bits. Usually only
+ 3 or 5 will be candidates */
+ for (i = nrawfactor-1; i >= 0; i--)
+ {
+ if (rawfactor[i].divisor > max_divisor / rawfactor[i].divisor)
+ continue;
+ mpz_mul_ui (new_pp, pp, (unsigned long) rawfactor[i].divisor);
+ if (mpz_sizeinbase (new_pp, 2) > mod_bits)
+ continue;
+ mpz_set (pp, new_pp);
+
+ rawfactor[i].multiplicity++;
+ }
+
+ mod_bits = mpz_sizeinbase (pp, 2);
+
+ mpz_set (pp_norm, pp);
+ while (mpz_sizeinbase (pp_norm, 2) < numb_bits)
+ mpz_add (pp_norm, pp_norm, pp_norm);
+
+ mpz_preinv_invert (pp_inverted, pp_norm, numb_bits);
+
+ mpz_clear (new_pp);
+ }
+
+ /* start the factor array */
+ for (i = 0; i < nrawfactor; i++)
+ {
+ int j;
+ ASSERT (nfactor < factor_alloc);
+ factor[nfactor].divisor = 1;
+ for (j = 0; j < rawfactor[i].multiplicity; j++)
+ factor[nfactor].divisor *= rawfactor[i].divisor;
+ nfactor++;
+ }
+
+ combine:
+ /* Combine entries in the factor array. Combine the smallest entry with
+ the biggest one that will fit with it (ie. under max_divisor), then
+ repeat that with the new smallest entry. */
+ qsort (factor, nfactor, sizeof (factor[0]), f_cmp_divisor);
+ for (i = nfactor-1; i >= 1; i--)
+ {
+ if (factor[i].divisor <= max_divisor / factor[0].divisor)
+ {
+ factor[0].divisor *= factor[i].divisor;
+ COLLAPSE_ELEMENT (factor, i, nfactor);
+ goto combine;
+ }
+ }
+
+ total_fraction = 1.0;
+ for (i = 0; i < nfactor; i++)
+ {
+ mpz_init (factor[i].inverse);
+ mpz_invert_ui_2exp (factor[i].inverse, factor[i].divisor, mod_bits);
+
+ mpz_init (factor[i].mask);
+ square_mask (factor[i].mask, factor[i].divisor);
+
+ /* fraction of possible squares */
+ factor[i].fraction = (double) mpz_popcount (factor[i].mask)
+ / factor[i].divisor;
+
+ /* total fraction of possible squares */
+ total_fraction *= factor[i].fraction;
+ }
+
+ /* best tests first (ie. smallest fraction) */
+ qsort (factor, nfactor, sizeof (factor[0]), f_cmp_fraction);
+}
+
+void
+print (int limb_bits, int nail_bits)
+{
+ int i;
+ mpz_t mhi, mlo;
+
+ printf ("/* This file generated by gen-psqr.c - DO NOT EDIT. */\n");
+ printf ("\n");
+
+ printf ("#if GMP_LIMB_BITS != %d || GMP_NAIL_BITS != %d\n",
+ limb_bits, nail_bits);
+ printf ("Error, error, this data is for %d bit limb and %d bit nail\n",
+ limb_bits, nail_bits);
+ printf ("#endif\n");
+ printf ("\n");
+
+ printf ("/* Non-zero bit indicates a quadratic residue mod 0x100.\n");
+ printf (" This test identifies %.2f%% as non-squares (%d/256). */\n",
+ (1.0 - sq_res_0x100_fraction) * 100.0,
+ 0x100 - sq_res_0x100_num);
+ printf ("static const mp_limb_t\n");
+ printf ("sq_res_0x100[%d] = {\n", nsq_res_0x100);
+ for (i = 0; i < nsq_res_0x100; i++)
+ {
+ printf (" CNST_LIMB(0x");
+ mpz_out_str (0, 16, sq_res_0x100[i]);
+ printf ("),\n");
+ }
+ printf ("};\n");
+ printf ("\n");
+
+ if (mpz_sgn (pp) != 0)
+ {
+ printf ("/* mpn_mod_34lsub1 not used due to %s */\n", mod34_excuse);
+ printf ("/* PERFSQR_PP = ");
+ }
+ else
+ printf ("/* 2^%d-1 = ", mod34_bits);
+ for (i = 0; i < nrawfactor; i++)
+ {
+ if (i != 0)
+ printf (" * ");
+ printf ("%d", rawfactor[i].divisor);
+ if (rawfactor[i].multiplicity != 1)
+ printf ("^%d", rawfactor[i].multiplicity);
+ }
+ printf (" %s*/\n", mpz_sgn (pp) == 0 ? "... " : "");
+
+ printf ("#define PERFSQR_MOD_BITS %d\n", mod_bits);
+ if (mpz_sgn (pp) != 0)
+ {
+ printf ("#define PERFSQR_PP CNST_LIMB(0x");
+ mpz_out_str (0, 16, pp);
+ printf (")\n");
+ printf ("#define PERFSQR_PP_NORM CNST_LIMB(0x");
+ mpz_out_str (0, 16, pp_norm);
+ printf (")\n");
+ printf ("#define PERFSQR_PP_INVERTED CNST_LIMB(0x");
+ mpz_out_str (0, 16, pp_inverted);
+ printf (")\n");
+ }
+ printf ("\n");
+
+ mpz_init (mhi);
+ mpz_init (mlo);
+
+ printf ("/* This test identifies %.2f%% as non-squares. */\n",
+ (1.0 - total_fraction) * 100.0);
+ printf ("#define PERFSQR_MOD_TEST(up, usize) \\\n");
+ printf (" do { \\\n");
+ printf (" mp_limb_t r; \\\n");
+ if (mpz_sgn (pp) != 0)
+ printf (" PERFSQR_MOD_PP (r, up, usize); \\\n");
+ else
+ printf (" PERFSQR_MOD_34 (r, up, usize); \\\n");
+
+ for (i = 0; i < nfactor; i++)
+ {
+ printf (" \\\n");
+ printf (" /* %5.2f%% */ \\\n",
+ (1.0 - factor[i].fraction) * 100.0);
+
+ printf (" PERFSQR_MOD_%d (r, %2d, CNST_LIMB(0x",
+ factor[i].divisor <= limb_bits ? 1 : 2,
+ factor[i].divisor);
+ mpz_out_str (0, 16, factor[i].inverse);
+ printf ("), \\\n");
+ printf (" CNST_LIMB(0x");
+
+ if ( factor[i].divisor <= limb_bits)
+ {
+ mpz_out_str (0, 16, factor[i].mask);
+ }
+ else
+ {
+ mpz_tdiv_r_2exp (mlo, factor[i].mask, (unsigned long) limb_bits);
+ mpz_tdiv_q_2exp (mhi, factor[i].mask, (unsigned long) limb_bits);
+ mpz_out_str (0, 16, mhi);
+ printf ("), CNST_LIMB(0x");
+ mpz_out_str (0, 16, mlo);
+ }
+ printf (")); \\\n");
+ }
+
+ printf (" } while (0)\n");
+ printf ("\n");
+
+ printf ("/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, %.2f%% non-squares. */\n",
+ (1.0 - (total_fraction * 44.0/256.0)) * 100.0);
+ printf ("\n");
+
+ printf ("/* helper for tests/mpz/t-perfsqr.c */\n");
+ printf ("#define PERFSQR_DIVISORS { 256,");
+ for (i = 0; i < nfactor; i++)
+ printf (" %d,", factor[i].divisor);
+ printf (" }\n");
+
+
+ mpz_clear (mhi);
+ mpz_clear (mlo);
+}
+
+int
+main (int argc, char *argv[])
+{
+ int limb_bits, nail_bits, numb_bits;
+
+ if (argc != 3)
+ {
+ fprintf (stderr, "Usage: gen-psqr <limbbits> <nailbits>\n");
+ exit (1);
+ }
+
+ limb_bits = atoi (argv[1]);
+ nail_bits = atoi (argv[2]);
+
+ if (limb_bits <= 0
+ || nail_bits < 0
+ || nail_bits >= limb_bits)
+ {
+ fprintf (stderr, "Invalid limb/nail bits: %d %d\n",
+ limb_bits, nail_bits);
+ exit (1);
+ }
+ numb_bits = limb_bits - nail_bits;
+
+ generate_sq_res_0x100 (limb_bits);
+ generate_mod (limb_bits, nail_bits);
+
+ print (limb_bits, nail_bits);
+
+ return 0;
+}