diff options
-rw-r--r-- | gen-psqr.c | 550 |
1 files changed, 550 insertions, 0 deletions
diff --git a/gen-psqr.c b/gen-psqr.c new file mode 100644 index 000000000..ced075074 --- /dev/null +++ b/gen-psqr.c @@ -0,0 +1,550 @@ +/* Generate perfect square testing data. + +Copyright 2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include <stdio.h> +#include <stdlib.h> + +#include "dumbmp.c" + + +/* The aim of this program is to choose either mpn_mod_34lsub1 or mpn_mod_1 + (plus a PERFSQR_PP modulus), and generate tables indicating residues and + non-residues modulo small factors of that modulus. + + For the usual 32 or 64 bit cases mpn_mod_34lsub1 gets used. That + function exists specifically because 2^24-1 and 2^48-1 have nice sets of + prime factors. For other limb sizes it's considered, but if it doesn't + have good factors then mpn_mod_1 will be used instead. + + When mpn_mod_1 is used, the modulus PERFSQR_PP is created from a + selection of small primes, chosen to fill PERFSQR_MOD_BITS of a limb, + with that bit count chosen so (2*GMP_LIMB_BITS)*2^PERFSQR_MOD_BITS <= + GMP_LIMB_MAX, allowing PERFSQR_MOD_IDX in mpn/generic/perfsqr.c to do its + calculation within a single limb. + + In either case primes can be combined to make divisors. The table data + then effectively indicates remainders which are quadratic residues mod + all the primes. This sort of combining reduces the number of steps + needed after mpn_mod_34lsub1 or mpn_mod_1, saving code size and time. + Nothing is gained or lost in terms of detections, the same total fraction + of non-residues will be identified. + + Nothing particularly sophisticated is attempted for combining factors to + make divisors. This is probably a kind of knapsack problem so it'd be + too hard to attempt anything completely general. For the usual 32 and 64 + bit limbs we get a good enough result just pairing the biggest and + smallest which fit together, repeatedly. + + Another aim is to get powerful combinations, ie. divisors which identify + biggest fraction of non-residues, and have those run first. Again for + the usual 32 and 64 bits it seems good enough just to pair for big + divisors then sort according to the resulting fraction of non-residues + identified. + + Also in this program, a table sq_res_0x100 of residues modulo 256 is + generated. This simply fills bits into limbs of the appropriate + build-time GMP_LIMB_BITS each. + +*/ + +mpz_t *sq_res_0x100; /* table of limbs */ +int nsq_res_0x100; /* elements in sq_res_0x100 array */ +int sq_res_0x100_num; /* squares in sq_res_0x100 */ +double sq_res_0x100_fraction; /* sq_res_0x100_num / 256 */ + +int mod34_bits; /* 3*GMP_NUMB_BITS/4 */ +int mod_bits; /* bits from PERFSQR_MOD_34 or MOD_PP */ +int max_divisor; /* all divisors <= max_divisor */ +int max_divisor_bits; /* ceil(log2(max_divisor)) */ +double total_fraction; /* of squares */ +mpz_t pp; /* product of primes, or 0 if mod_34lsub1 used */ +mpz_t pp_norm; /* pp shifted so NUMB high bit set */ +mpz_t pp_inverted; /* invert_limb style inverse */ +mpz_t mod_mask; /* 2^mod_bits-1 */ +char mod34_excuse[128]; /* why mod_34lsub1 not used (if it's not) */ + +/* raw list of divisors of 2^mod34_bits-1 or pp, just to show in a comment */ +struct rawfactor_t { + int divisor; + int multiplicity; +}; +struct rawfactor_t *rawfactor; +int nrawfactor; + +/* factors of 2^mod34_bits-1 or pp and associated data, after combining etc */ +struct factor_t { + int divisor; + mpz_t inverse; /* 1/divisor mod 2^mod_bits */ + mpz_t mask; /* indicating squares mod divisor */ + double fraction; /* squares/total */ +}; +struct factor_t *factor; +int nfactor; /* entries in use in factor array */ +int factor_alloc; /* entries allocated to factor array */ + + +int +f_cmp_divisor (struct factor_t *p, struct factor_t *q) +{ + if (p->divisor > q->divisor) + return 1; + else if (p->divisor < q->divisor) + return -1; + else + return 0; +} + +int +f_cmp_fraction (struct factor_t *p, struct factor_t *q) +{ + if (p->fraction > q->fraction) + return 1; + else if (p->fraction < q->fraction) + return -1; + else + return 0; +} + +/* Remove array[idx] by copying the remainder down, and adjust narray + accordingly. */ +#define COLLAPSE_ELEMENT(array, idx, narray) \ + do { \ + mem_copyi ((char *) &(array)[idx], \ + (char *) &(array)[idx+1], \ + ((narray)-((idx)+1)) * sizeof (array[0])); \ + (narray)--; \ + } while (0) + + +/* return n*2^p mod m */ +int +mul_2exp_mod (int n, int p, int m) +{ + int i; + for (i = 0; i < p; i++) + n = (2 * n) % m; + return n; +} + +/* return -n mod m */ +int +neg_mod (int n, int m) +{ + ASSERT (n >= 0 && n < m); + return (n == 0 ? 0 : m-n); +} + +/* Set "mask" to a value such that "mask & (1<<idx)" is non-zero if + "-(idx<<mod_bits)" can be a square modulo m. */ +void +square_mask (mpz_t mask, int m) +{ + int p, i, r, idx; + + p = mul_2exp_mod (1, mod_bits, m); + p = neg_mod (p, m); + + mpz_set_ui (mask, 0L); + for (i = 0; i < m; i++) + { + r = (i * i) % m; + idx = (r * p) % m; + mpz_setbit (mask, (unsigned long) idx); + } +} + +void +generate_sq_res_0x100 (int limb_bits) +{ + int i, res; + + nsq_res_0x100 = (0x100 + limb_bits - 1) / limb_bits; + sq_res_0x100 = (mpz_t *) xmalloc (nsq_res_0x100 * sizeof (*sq_res_0x100)); + + for (i = 0; i < nsq_res_0x100; i++) + mpz_init_set_ui (sq_res_0x100[i], 0L); + + for (i = 0; i < 0x100; i++) + { + res = (i * i) % 0x100; + mpz_setbit (sq_res_0x100[res / limb_bits], + (unsigned long) (res % limb_bits)); + } + + sq_res_0x100_num = 0; + for (i = 0; i < nsq_res_0x100; i++) + sq_res_0x100_num += mpz_popcount (sq_res_0x100[i]); + sq_res_0x100_fraction = (double) sq_res_0x100_num / 256.0; +} + +void +generate_mod (int limb_bits, int nail_bits) +{ + int numb_bits = limb_bits - nail_bits; + int i, divisor; + + mpz_init_set_ui (pp, 0L); + mpz_init_set_ui (pp_norm, 0L); + mpz_init_set_ui (pp_inverted, 0L); + + /* no more than limb_bits many factors in a one limb modulus (and of + course in reality nothing like that many) */ + factor_alloc = limb_bits; + factor = (struct factor_t *) xmalloc (factor_alloc * sizeof (*factor)); + rawfactor = (struct rawfactor_t *) + xmalloc (factor_alloc * sizeof (*rawfactor)); + + if (numb_bits % 4 != 0) + { + strcpy (mod34_excuse, "GMP_NUMB_BITS % 4 != 0"); + goto use_pp; + } + + max_divisor = 2*limb_bits; + max_divisor_bits = log2_ceil (max_divisor); + + if (numb_bits / 4 < max_divisor_bits) + { + /* Wind back to one limb worth of max_divisor, if that will let us use + mpn_mod_34lsub1. */ + max_divisor = limb_bits; + max_divisor_bits = log2_ceil (max_divisor); + + if (numb_bits / 4 < max_divisor_bits) + { + strcpy (mod34_excuse, "GMP_NUMB_BITS / 4 too small"); + goto use_pp; + } + } + + { + /* Can use mpn_mod_34lsub1, find small factors of 2^mod34_bits-1. */ + mpz_t m, q, r; + int multiplicity; + + mod34_bits = (numb_bits / 4) * 3; + + /* mpn_mod_34lsub1 returns a full limb value, PERFSQR_MOD_34 folds it at + the mod34_bits mark, adding the two halves for a remainder of at most + mod34_bits+1 many bits */ + mod_bits = mod34_bits + 1; + + mpz_init_set_ui (m, 1L); + mpz_mul_2exp (m, m, mod34_bits); + mpz_sub_ui (m, m, 1L); + + mpz_init (q); + mpz_init (r); + + for (i = 3; i <= max_divisor; i++) + { + if (! isprime (i)) + continue; + + mpz_tdiv_qr_ui (q, r, m, (unsigned long) i); + if (mpz_sgn (r) != 0) + continue; + + /* if a repeated prime is found it's used as an i^n in one factor */ + divisor = 1; + multiplicity = 0; + do + { + if (divisor > max_divisor / i) + break; + multiplicity++; + mpz_set (m, q); + mpz_tdiv_qr_ui (q, r, m, (unsigned long) i); + } + while (mpz_sgn (r) == 0); + + ASSERT (nrawfactor < factor_alloc); + rawfactor[nrawfactor].divisor = i; + rawfactor[nrawfactor].multiplicity = multiplicity; + nrawfactor++; + } + + mpz_clear (m); + mpz_clear (q); + mpz_clear (r); + } + + if (nrawfactor <= 2) + { + mpz_t new_pp; + + sprintf (mod34_excuse, "only %d small factor%s", + nrawfactor, nrawfactor == 1 ? "" : "s"); + + use_pp: + /* reset to two limbs of max_divisor, in case the mpn_mod_34lsub1 code + tried with just one */ + max_divisor = 2*limb_bits; + max_divisor_bits = log2_ceil (max_divisor); + + mpz_init (new_pp); + nrawfactor = 0; + mod_bits = MIN (numb_bits, limb_bits - max_divisor_bits); + + /* one copy of each small prime */ + mpz_set_ui (pp, 1L); + for (i = 3; i <= max_divisor; i++) + { + if (! isprime (i)) + continue; + + mpz_mul_ui (new_pp, pp, (unsigned long) i); + if (mpz_sizeinbase (new_pp, 2) > mod_bits) + break; + mpz_set (pp, new_pp); + + ASSERT (nrawfactor < factor_alloc); + rawfactor[nrawfactor].divisor = i; + rawfactor[nrawfactor].multiplicity = 1; + nrawfactor++; + } + + /* Plus an extra copy of one or more of the primes selected, if that + still fits in max_divisor and the total in mod_bits. Usually only + 3 or 5 will be candidates */ + for (i = nrawfactor-1; i >= 0; i--) + { + if (rawfactor[i].divisor > max_divisor / rawfactor[i].divisor) + continue; + mpz_mul_ui (new_pp, pp, (unsigned long) rawfactor[i].divisor); + if (mpz_sizeinbase (new_pp, 2) > mod_bits) + continue; + mpz_set (pp, new_pp); + + rawfactor[i].multiplicity++; + } + + mod_bits = mpz_sizeinbase (pp, 2); + + mpz_set (pp_norm, pp); + while (mpz_sizeinbase (pp_norm, 2) < numb_bits) + mpz_add (pp_norm, pp_norm, pp_norm); + + mpz_preinv_invert (pp_inverted, pp_norm, numb_bits); + + mpz_clear (new_pp); + } + + /* start the factor array */ + for (i = 0; i < nrawfactor; i++) + { + int j; + ASSERT (nfactor < factor_alloc); + factor[nfactor].divisor = 1; + for (j = 0; j < rawfactor[i].multiplicity; j++) + factor[nfactor].divisor *= rawfactor[i].divisor; + nfactor++; + } + + combine: + /* Combine entries in the factor array. Combine the smallest entry with + the biggest one that will fit with it (ie. under max_divisor), then + repeat that with the new smallest entry. */ + qsort (factor, nfactor, sizeof (factor[0]), f_cmp_divisor); + for (i = nfactor-1; i >= 1; i--) + { + if (factor[i].divisor <= max_divisor / factor[0].divisor) + { + factor[0].divisor *= factor[i].divisor; + COLLAPSE_ELEMENT (factor, i, nfactor); + goto combine; + } + } + + total_fraction = 1.0; + for (i = 0; i < nfactor; i++) + { + mpz_init (factor[i].inverse); + mpz_invert_ui_2exp (factor[i].inverse, factor[i].divisor, mod_bits); + + mpz_init (factor[i].mask); + square_mask (factor[i].mask, factor[i].divisor); + + /* fraction of possible squares */ + factor[i].fraction = (double) mpz_popcount (factor[i].mask) + / factor[i].divisor; + + /* total fraction of possible squares */ + total_fraction *= factor[i].fraction; + } + + /* best tests first (ie. smallest fraction) */ + qsort (factor, nfactor, sizeof (factor[0]), f_cmp_fraction); +} + +void +print (int limb_bits, int nail_bits) +{ + int i; + mpz_t mhi, mlo; + + printf ("/* This file generated by gen-psqr.c - DO NOT EDIT. */\n"); + printf ("\n"); + + printf ("#if GMP_LIMB_BITS != %d || GMP_NAIL_BITS != %d\n", + limb_bits, nail_bits); + printf ("Error, error, this data is for %d bit limb and %d bit nail\n", + limb_bits, nail_bits); + printf ("#endif\n"); + printf ("\n"); + + printf ("/* Non-zero bit indicates a quadratic residue mod 0x100.\n"); + printf (" This test identifies %.2f%% as non-squares (%d/256). */\n", + (1.0 - sq_res_0x100_fraction) * 100.0, + 0x100 - sq_res_0x100_num); + printf ("static const mp_limb_t\n"); + printf ("sq_res_0x100[%d] = {\n", nsq_res_0x100); + for (i = 0; i < nsq_res_0x100; i++) + { + printf (" CNST_LIMB(0x"); + mpz_out_str (0, 16, sq_res_0x100[i]); + printf ("),\n"); + } + printf ("};\n"); + printf ("\n"); + + if (mpz_sgn (pp) != 0) + { + printf ("/* mpn_mod_34lsub1 not used due to %s */\n", mod34_excuse); + printf ("/* PERFSQR_PP = "); + } + else + printf ("/* 2^%d-1 = ", mod34_bits); + for (i = 0; i < nrawfactor; i++) + { + if (i != 0) + printf (" * "); + printf ("%d", rawfactor[i].divisor); + if (rawfactor[i].multiplicity != 1) + printf ("^%d", rawfactor[i].multiplicity); + } + printf (" %s*/\n", mpz_sgn (pp) == 0 ? "... " : ""); + + printf ("#define PERFSQR_MOD_BITS %d\n", mod_bits); + if (mpz_sgn (pp) != 0) + { + printf ("#define PERFSQR_PP CNST_LIMB(0x"); + mpz_out_str (0, 16, pp); + printf (")\n"); + printf ("#define PERFSQR_PP_NORM CNST_LIMB(0x"); + mpz_out_str (0, 16, pp_norm); + printf (")\n"); + printf ("#define PERFSQR_PP_INVERTED CNST_LIMB(0x"); + mpz_out_str (0, 16, pp_inverted); + printf (")\n"); + } + printf ("\n"); + + mpz_init (mhi); + mpz_init (mlo); + + printf ("/* This test identifies %.2f%% as non-squares. */\n", + (1.0 - total_fraction) * 100.0); + printf ("#define PERFSQR_MOD_TEST(up, usize) \\\n"); + printf (" do { \\\n"); + printf (" mp_limb_t r; \\\n"); + if (mpz_sgn (pp) != 0) + printf (" PERFSQR_MOD_PP (r, up, usize); \\\n"); + else + printf (" PERFSQR_MOD_34 (r, up, usize); \\\n"); + + for (i = 0; i < nfactor; i++) + { + printf (" \\\n"); + printf (" /* %5.2f%% */ \\\n", + (1.0 - factor[i].fraction) * 100.0); + + printf (" PERFSQR_MOD_%d (r, %2d, CNST_LIMB(0x", + factor[i].divisor <= limb_bits ? 1 : 2, + factor[i].divisor); + mpz_out_str (0, 16, factor[i].inverse); + printf ("), \\\n"); + printf (" CNST_LIMB(0x"); + + if ( factor[i].divisor <= limb_bits) + { + mpz_out_str (0, 16, factor[i].mask); + } + else + { + mpz_tdiv_r_2exp (mlo, factor[i].mask, (unsigned long) limb_bits); + mpz_tdiv_q_2exp (mhi, factor[i].mask, (unsigned long) limb_bits); + mpz_out_str (0, 16, mhi); + printf ("), CNST_LIMB(0x"); + mpz_out_str (0, 16, mlo); + } + printf (")); \\\n"); + } + + printf (" } while (0)\n"); + printf ("\n"); + + printf ("/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, %.2f%% non-squares. */\n", + (1.0 - (total_fraction * 44.0/256.0)) * 100.0); + printf ("\n"); + + printf ("/* helper for tests/mpz/t-perfsqr.c */\n"); + printf ("#define PERFSQR_DIVISORS { 256,"); + for (i = 0; i < nfactor; i++) + printf (" %d,", factor[i].divisor); + printf (" }\n"); + + + mpz_clear (mhi); + mpz_clear (mlo); +} + +int +main (int argc, char *argv[]) +{ + int limb_bits, nail_bits, numb_bits; + + if (argc != 3) + { + fprintf (stderr, "Usage: gen-psqr <limbbits> <nailbits>\n"); + exit (1); + } + + limb_bits = atoi (argv[1]); + nail_bits = atoi (argv[2]); + + if (limb_bits <= 0 + || nail_bits < 0 + || nail_bits >= limb_bits) + { + fprintf (stderr, "Invalid limb/nail bits: %d %d\n", + limb_bits, nail_bits); + exit (1); + } + numb_bits = limb_bits - nail_bits; + + generate_sq_res_0x100 (limb_bits); + generate_mod (limb_bits, nail_bits); + + print (limb_bits, nail_bits); + + return 0; +} |