commit previously reviewed version of mpmontg.c to the TMP branch as a baseline

for the 'final' version.
author: relyea%netscape.com <devnull@localhost> 2005-11-11 19:53:26 +0000
committer: relyea%netscape.com <devnull@localhost> 2005-11-11 19:53:26 +0000
commit: 885fa706d4fc6806355c184035d4c3553b57a5e2 (patch)
tree: 95f25a28a5d924c7ad4ee9b8f0995f637ebb8d20
parent: 146060d17ac1ee1fabef61661ce40bb4f869f694 (diff)
download: nss-hg-885fa706d4fc6806355c184035d4c3553b57a5e2.tar.gz
1 files changed, 350 insertions, 130 deletions
diff --git a/security/nss/lib/freebl/mpi/mpmontg.c b/security/nss/lib/freebl/mpi/mpmontg.c
index cc4d233fd..13a95b172 100644
--- a/security/nss/lib/freebl/mpi/mpmontg.c
+++ b/security/nss/lib/freebl/mpi/mpmontg.c
@@ -50,6 +50,7 @@
 /* #define MP_USING_MONT_MULF 1 */
 #define MP_USING_CACHE_SAFE_MOD_EXP 1 
 #define MP_USING_WEAVE_COPY 1 
+#define MP_CHAR_STORE_SLOW 1
 #include <string.h>
 #include "mpi-priv.h"
 #include "mp_gf2m-priv.h"
@@ -58,10 +59,15 @@
 #ifdef MP_USING_MONT_MULF
 #include "montmulf.h"
 #endif
-#include "prtypes.h"
+#include <stddef.h> /* ptrdiff_t */
 
-#include <fcntl.h>
-#include <unistd.h>
+/* need to know endianness of this platform. If we aren't told, get it from
+ * nspr... */
+#ifdef MP_CHAR_STORE_SLOW
+#if !defined(IS_BIG_ENDIAN) && !defined(IS_LITTLE_ENDIAN)
+#include "prcpucfg.h"
+#endif
+#endif
 
 #define STATIC
 /* #define DEBUG 1  */
@@ -70,8 +76,8 @@
 #define MAX_ODD_INTS    32   /* 2 ** (WINDOW_BITS - 1) */
 #define MAX_POWERS MAX_ODD_INTS*2
 #define MAX_MODULUS_BITS 8192
-#define MAX_MODULUS_LENGTH (MAX_MODULUS_BITS/8)
-#define MAX_MODULUS_DIGITS (MAX_MODULUS_LENGTH/sizeof(mp_digit))
+#define MAX_MODULUS_BYTES (MAX_MODULUS_BITS/8)
+#define MAX_MODULUS_DIGITS (MAX_MODULUS_BYTES/sizeof(mp_digit))
 
 #if defined(_WIN32_WCE)
 #define ABORT  res = MP_UNDEF; goto CLEANUP
@@ -523,77 +529,97 @@ CLEANUP:
 #undef MUL
 
 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
-
 unsigned int mp_using_cache_safe_exp = 1;
+#endif
 
-void mp_set_mode_modify() { mp_using_cache_safe_exp = 0; }
-void mp_set_mode_safe() { mp_using_cache_safe_exp = 1; }
+mp_err mp_set_modexp_mode(int value)
+{
+#ifdef MP_USING_CACHE_SAFE_MOD_EXP
+ mp_using_cache_safe_exp = value;
+ return MP_OKAY;
+#else
+ if (value == 0) {
+   return MP_OKAY;
+ }
+ return MP_BADARG;
+#endif
+}
+
+
+#ifdef MP_USING_CACHE_SAFE_MOD_EXP
 
 #ifndef MP_USING_WEAVE_COPY
-#if MP_DIGIT_BITS == 32
+#ifndef MP_CHAR_STORE_SLOW
+#define WEAVE_BASE_INIT \
+  unsigned char *_ptr;
+
+#define WEAVE_FIRST(bi,b,count) \
+  _ptr = (unsigned char *)bi; \
+  *_ptr++ = *b; b+= count;
+
+#define WEAVE_MIDDLE(bi,b,count) \
+  *_ptr++ = *b; b+= count;
+
+#define WEAVE_LAST(bi,b,count) \
+  *_ptr++ = *b; b+= count; 
+
+#else
+#define WEAVE_BASE_INIT \
+  register mp_digit _digit;
+
+#define WEAVE_FIRST(bi,b,count) \
+  _digit = *b << 8; b += count; 
+
+#define WEAVE_MIDDLE(bi,b,count) \
+  _digit |= *b; b += count; _digit = _digit << 8; 
+
+#define WEAVE_LAST(bi,b,count) \
+  _digit |= *b; b += count; \
+  *bi = _digit;
+#endif /* MP_CHAR_STORE_SLOW */
+
+#if MP_DIGIT_BITS == 32 
 #define WEAVE_INIT  \
-   unsigned char *_ptr;
+  WEAVE_BASE_INIT
 
 #define WEAVE_FETCH(bi, b, count) \
-   _ptr = (unsigned char *)bi; \
-   *_ptr++ = *b; b+= count;  \
-   *_ptr++ = *b; b+= count;  \
-   *_ptr++ = *b; b+= count;  \
-   *_ptr++ = *b; b+= count;
-
-#define WEAVE_PUT(bi, b, count) \
-   _ptr = (unsigned char *)bi; \
-   *b = *_ptr++; b+= count;  \
-   *b = *_ptr++; b+= count;  \
-   *b = *_ptr++; b+= count;  \
-   *b = *_ptr++; b+= count;
+  WEAVE_FIRST(bi,b,count) \
+  WEAVE_MIDDLE(bi,b,count) \
+  WEAVE_MIDDLE(bi,b,count) \
+  WEAVE_LAST(bi,b,count)
+
 #else
-#if MP_DIGIT_BITS == 64
+#ifdef MP_DIGIT_BITS == 64 
+
 #define WEAVE_INIT  \
-   unsigned char *_ptr
+  WEAVE_BASE_INIT
 
 #define WEAVE_FETCH(bi, b, count) \
-   _ptr = (unsigned char *)bi; \
-   *_ptr++ = *b; b+= count;  \
-   *_ptr++ = *b; b+= count;  \
-   *_ptr++ = *b; b+= count;  \
-   *_ptr++ = *b; b+= count;  \
-   *_ptr++ = *b; b+= count;  \
-   *_ptr++ = *b; b+= count;  \
-   *_ptr++ = *b; b+= count;  \
-   *_ptr++ = *b; b+= count;
-
-#define WEAVE_PUT(bi, b, count) \
-   _ptr = (unsigned char *)bi; \
-   *b = *_ptr++; b+= count;  \
-   *b = *_ptr++; b+= count;  \
-   *b = *_ptr++; b+= count;  \
-   *b = *_ptr++; b+= count;  \
-   *b = *_ptr++; b+= count;  \
-   *b = *_ptr++; b+= count;  \
-   *b = *_ptr++; b+= count;  \
-   *b = *_ptr++; b+= count;
+  WEAVE_FIRST(bi,b,count) \
+  WEAVE_MIDDLE(bi,b,count) \
+  WEAVE_MIDDLE(bi,b,count) \
+  WEAVE_MIDDLE(bi,b,count) \
+  WEAVE_MIDDLE(bi,b,count) \
+  WEAVE_MIDDLE(bi,b,count) \
+  WEAVE_MIDDLE(bi,b,count) \
+  WEAVE_LAST(bi,b,count)
+
 #else
 
 #define WEAVE_INIT \
-   int _i; \
-   unsigned char *_ptr;
+  int _i; \
+  WEAVE_BASE_INIT
 
-   /* It would be nice to unroll this loop as well */
+  /* It would be nice to unroll this loop as well */
 #define WEAVE_FETCH(bi, b, count) \
-   _ptr = (unsigned char *)bi; \
-   for (_i=0; _i < sizeof mp_digit ; _i++) { \
-	*_ptr++ = *b; \
-	b+=count; \
-   }
-
-#define WEAVE_PUT(bi, b, count) \
-   _ptr = (unsigned char *)bi; \
-   for (_i=0; _i < sizeof mp_digit ; _i++) { \
-	*b = *_ptr++; \
-	b+=count; \
-   }
-#endif 
+  WEAVE_FIRST(bi,b,count) \
+  WEAVE_LAST(bi,b,count) \
+  for (_i=1; _i < sizeof(mp_digit) -1 ; _i++) { \
+    WEAVE_MIDDLE(bi,b,count) \
+  } \
+  WEAVE_LAST(bi,b,count)
+
+#endif
 #endif
 
 #if !defined(MP_MONT_USE_MP_MUL)
@@ -702,64 +728,241 @@ CLEANUP:
 } /* end mp_mul() */
 #endif /* MP_USING_WEAVE_COPY */
 
+#define WEAVE_WORD_SIZE 4
+
+#ifndef MP_CHAR_STORE_SLOW
+mp_err mpi_to_weave(const mp_int *a, unsigned char *b, 
+			             mp_size b_size,  mp_size count)
+{
+  mp_size i, j;
+  unsigned char *bsave = b;
+
+  for (i=0; i < WEAVE_WORD_SIZE; i++) {
+    unsigned char *pb = (unsigned char *)MP_DIGITS(&a[i]);
+    mp_size useda = MP_USED(&a[i]);
+    mp_size zero =  b_size - useda;
+    unsigned char *end = pb+ (useda*sizeof(mp_digit));
+    b = bsave+i;
+
+
+    ARGCHK(MP_SIGN(&a[i]) == 0, MP_BADARG);
+    ARGCHK(useda <= b_size, MP_BADARG);
+
+    for (; pb < end; pb++) {
+      *b = *pb;
+      b += count;
+    }
+    for (j=0; j < zero; j++) {
+      *b = 0;
+      b += count;
+    }
+  }
+
+  return MP_OKAY;
+}
+#else
+/* Need a primitive that we know is 32 bits long... */
+#if UINT_MAX == MP_32BIT_MAX
+typedef unsigned int mp_weave_word;
+#else
+#if ULONG_MAX == MP_32BIT_MAX
+typedef unsigned long mp_weave_word;
+#else
+#error "Can't find 32 bit primitive type for this platform"
+#endif
+#endif
+
+/*
+ * On some platforms character stores into memory are very expensive since they
+ * generate a read/modify/write operation on the bus. On those platforms
+ * we need to do integer writes to the bus.
+ *
+ * The weave_to_mpi function in those cases expects the data to be laid out in
+ * big endian, interleaved.
+ *
+ * Since we need to interleave on a byte by byte basis, we need to collect
+ * several mpi structures together into a single uint32 before we write. We
+ * also need to make sure the uint32 is arranged so that the first value of
+ * the first array winds up in b[0]. This means construction of that uint32
+ * is endian specific (even though the layout of the array is always big
+ * endian).
+ */
 mp_err mpi_to_weave(const mp_int *a, unsigned char *b, 
 					mp_size b_size, mp_size count)
 {
   mp_size i;
-  unsigned char *pb = (unsigned char *)MP_DIGITS(a);
-  mp_size useda = MP_USED(a);
-  mp_size zero =  b_size - useda;
-  unsigned char *end = pb+ (useda*sizeof(mp_digit));
+  mp_digit *digitsa0;
+  mp_digit *digitsa1;
+  mp_digit *digitsa2;
+  mp_digit *digitsa3;
+  mp_size   useda0;
+  mp_size   useda1;
+  mp_size   useda2;
+  mp_size   useda3;
+  mp_weave_word *weaved = (mp_weave_word *)b;
+#if MP_DIGIT_BITS != 32 && MP_DIGIT_BITS != 64
+  mp_size j;
+#endif
 
-  ARGCHK(MP_SIGN(a) == 0, MP_BADARG);
-  ARGCHK(useda <= b_size, MP_BADARG);
+  count = count/sizeof(mp_weave_word);
+
+  /* this code pretty much depends on this ! */
+  /*assert(WEAVE_WORD_SIZE == 4); */
+
+  digitsa0 = MP_DIGITS(&a[0]);
+  digitsa1 = MP_DIGITS(&a[1]);
+  digitsa2 = MP_DIGITS(&a[2]);
+  digitsa3 = MP_DIGITS(&a[3]);
+  useda0 = MP_USED(&a[0]);
+  useda1 = MP_USED(&a[1]);
+  useda2 = MP_USED(&a[2]);
+  useda3 = MP_USED(&a[3]);
+
+  ARGCHK(MP_SIGN(&a[0]) == 0, MP_BADARG);
+  ARGCHK(MP_SIGN(&a[1]) == 0, MP_BADARG);
+  ARGCHK(MP_SIGN(&a[2]) == 0, MP_BADARG);
+  ARGCHK(MP_SIGN(&a[3]) == 0, MP_BADARG);
+  ARGCHK(useda0 <= b_size, MP_BADARG);
+  ARGCHK(useda1 <= b_size, MP_BADARG);
+  ARGCHK(useda2 <= b_size, MP_BADARG);
+  ARGCHK(useda3 <= b_size, MP_BADARG);
+
+#define SAFE_FETCH(digit, used, word) ((i) < (used) ? (digit[i]) : 0)
+
+  for (i=0; i < b_size; i++) {
+    mp_digit d0 = SAFE_FETCH(digitsa0,useda0,i);
+    mp_digit d1 = SAFE_FETCH(digitsa1,useda1,i);
+    mp_digit d2 = SAFE_FETCH(digitsa2,useda2,i);
+    mp_digit d3 = SAFE_FETCH(digitsa3,useda3,i);
+    register mp_weave_word acc;
 
-  for (; pb < end; pb++) {
-    *b = *pb;
-    b += count;
-  }
-  for (i=0; i < zero; i++) {
-    *b = 0;
-    b += count;
+/*
+ * ONE_STEP takes the MSB of each of our current digits and places that
+ * byte in the appropriate position for writing to the weaved array.
+ *  On little endian:
+ *   b3 b2 b1 b0
+ *  On big endian:
+ *   b0 b1 b2 b3
+ *  When the data is written it would always wind up:
+ *   b[0] = b0
+ *   b[1] = b1
+ *   b[2] = b2
+ *   b[3] = b3
+ *
+ * Once we've written the MSB, we shift the whole digit left by one
+ * byte, putting the Next Most Significant Byte in the MSB position,
+ * so when we repeat the next one step that byte will be written.
+ */
+#ifdef IS_LITTLE_ENDIAN 
+#define MPI_WEAVE_ONE_STEP \
+    acc  = (d0 >> (MP_DIGIT_BITS-8))  & 0xff      ; d0 <<= 8; /*b0*/ \
+    acc |= (d1 >> (MP_DIGIT_BITS-16)) & 0xff00    ; d1 <<= 8; /*b1*/ \
+    acc |= (d2 >> (MP_DIGIT_BITS-24)) & 0xff0000  ; d2 <<= 8; /*b2*/ \
+    acc |= (d3 >> (MP_DIGIT_BITS-32)) & 0xff000000; d3 <<= 8; /*b3*/ \
+    *weaved = acc; weaved += count;
+#else 
+#error "Intel is Little endian, but IS_LITTLE_ENDIAN is not defined!"
+#define MPI_WEAVE_ONE_STEP \
+    acc  = (d0 >> (MP_DIGIT_BITS-32)) & 0xff000000; d0 <<= 8; /*b0*/ \
+    acc |= (d1 >> (MP_DIGIT_BITS-24)) & 0xff0000  ; d1 <<= 8; /*b1*/ \
+    acc |= (d2 >> (MP_DIGIT_BITS-16)) & 0xff00    ; d2 <<= 8; /*b2*/ \
+    acc |= (d3 >> (MP_DIGIT_BITS-8))  & 0xff      ; d3 <<= 8; /*b3*/ \
+    *weaved = acc; weaved += count;
+#endif 
+
+#if MP_DIGIT_BITS == 32 || MP_DIGIT_BITS == 64
+    MPI_WEAVE_ONE_STEP
+    MPI_WEAVE_ONE_STEP
+    MPI_WEAVE_ONE_STEP
+    MPI_WEAVE_ONE_STEP
+#if MP_DIGIT_BITS == 64
+    MPI_WEAVE_ONE_STEP
+    MPI_WEAVE_ONE_STEP
+    MPI_WEAVE_ONE_STEP
+    MPI_WEAVE_ONE_STEP
+#endif
+#else
+    for (j=0; j < sizeof (mp_digit); j++) {
+      MPI_WEAVE_ONE_STEP
+    }
+#endif
   }
 
   return MP_OKAY;
 }
+#endif
 
 #ifdef MP_USING_WEAVE_COPY
+#ifndef MP_CHAR_STORE_SLOW
 mp_err weave_to_mpi(mp_int *a, const unsigned char *b, 
 					mp_size b_size, mp_size count)
 {
-  unsigned char *pb = (unsigned char *)MP_DIGITS(a);
+  unsigned char  *pb = (unsigned char *)MP_DIGITS(a);
   unsigned char *end = pb+ (b_size*sizeof(mp_digit));
 
   MP_SIGN(a) = 0;
   MP_USED(a) = b_size;
 
-  for (; pb < end; pb++) {
+  for (; pb < end; b+=count, pb++) {
     *pb = *b;
-    b += count;
   }
   return MP_OKAY;
 }
+#else
+mp_err weave_to_mpi(mp_int *a, const unsigned char *b, 
+					mp_size b_size, mp_size count)
+{
+  mp_digit *pb = MP_DIGITS(a);
+  mp_digit *end = &pb[b_size];
+
+  MP_SIGN(a) = 0;
+  MP_USED(a) = b_size;
+
+  for (; pb < end; pb++) {
+    register mp_digit digit;
+
+    digit = *b << 8; b += count;
+#if MP_DIGIT_BITS == 32 || MP_DIGIT_BITS == 64
+    digit |= *b; b += count; digit = digit << 8;
+    digit |= *b; b += count; digit = digit << 8;
+#if MP_DIGIT_BITS == 64
+    digit |= *b; b += count; digit = digit << 8;
+    digit |= *b; b += count; digit = digit << 8;
+    digit |= *b; b += count; digit = digit << 8;
+    digit |= *b; b += count; digit = digit << 8;
+#endif
+#else
+    for (i=1; i < sizeof(mp_digit)-1; i++) {
+	digit |= *b; b += count; digit = digit << 8;
+    }
 #endif
+    digit |= *b; b += count; 
+
+    *pb = digit;
+  }
+  return MP_OKAY;
+}
+#endif
+#endif /* MP_USING_WEAVE_COPY */
 
 
 #define SQR(a,b) \
   MP_CHECKOK( mp_sqr(a, b) );\
   MP_CHECKOK( s_mp_redc(b, mmm) );
 
-#ifdef MP_USING_WEAVE_COPY
 #if defined(MP_MONT_USE_MP_MUL)
-#define MUL(x,a,b) \
-  MP_CHECKOK( weave_to_mpi(&tmp, powers + (x), nLen, num_powers) ); \
-  MP_CHECKOK( mp_mul_weave(a, &tmp, b) ); \
+#define MUL_NOWEAVE(x,a,b) \
+  MP_CHECKOK( mp_mul(a, x, b) ); \
   MP_CHECKOK( s_mp_redc(b, mmm) ) ; 
 #else
+#define MUL_NOWEAVE(x,a,b) \
+  MP_CHECKOK( s_mp_mul_mont(a, x, b, mmm) );
+#endif
+
+#ifdef MP_USING_WEAVE_COPY
 #define MUL(x,a,b) \
   MP_CHECKOK( weave_to_mpi(&tmp, powers + (x), nLen, num_powers) ); \
-  MP_CHECKOK( s_mp_mul_mont(a, &tmp, b, mmm) );
-#endif
+  MUL_NOWEAVE(&tmp,a,b)
 #else
 #if defined(MP_MONT_USE_MP_MUL)
 #define MUL(x,a,b) \
@@ -769,7 +972,7 @@ mp_err weave_to_mpi(mp_int *a, const unsigned char *b,
 #define MUL(x,a,b) \
   MP_CHECKOK( s_mp_mul_mont_weave(a, powers + (x), nLen, num_powers, b, mmm) );
 #endif
-#endif
+#endif /* MP_USING_WEAVE_COPY */
 
 #define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp
 #define MP_ALIGN(x,y) ((((ptrdiff_t)(x))+((y)-1))&(~((y)-1)))
@@ -786,15 +989,15 @@ mp_err mp_exptmod_safe_i(const mp_int *   montBase,
 		    mp_size          num_powers)
 {
   mp_int *pa1, *pa2, *ptmp;
-  mp_size i, j;
+  mp_size i;
   mp_size first_window;
   mp_err  res;
   int     expOff;
-  mp_int  accum1, accum2;
+  mp_int  accum1, accum2, accum[WEAVE_WORD_SIZE];
 #ifdef MP_USING_WEAVE_COPY
   mp_int  tmp;
 #endif
-  unsigned char powersArray[MAX_POWERS * (MAX_MODULUS_LENGTH+1)];
+  unsigned char powersArray[MAX_POWERS * (MAX_MODULUS_BYTES+1)];
   unsigned char *powers;
 
   ARGCHK( nLen <= MAX_MODULUS_DIGITS , MP_BADARG);
@@ -804,6 +1007,10 @@ mp_err mp_exptmod_safe_i(const mp_int *   montBase,
 
   MP_DIGITS(&accum1) = 0;
   MP_DIGITS(&accum2) = 0;
+  MP_DIGITS(&accum[0]) = 0;
+  MP_DIGITS(&accum[1]) = 0;
+  MP_DIGITS(&accum[2]) = 0;
+  MP_DIGITS(&accum[3]) = 0;
 
   /* grab the first window value. This allows us to preload accumulator1
    * and save a conversion, some squares and a multiple*/
@@ -811,62 +1018,75 @@ mp_err mp_exptmod_safe_i(const mp_int *   montBase,
 				bits_in_exponent-window_bits, window_bits) );
   first_window = (mp_size)res;
 
-  MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) );
+  MP_CHECKOK( mp_init_size(&accum[0], 3 * nLen + 2) );
+  MP_CHECKOK( mp_init_size(&accum[1], 3 * nLen + 2) );
+  MP_CHECKOK( mp_init_size(&accum[2], 3 * nLen + 2) );
+  MP_CHECKOK( mp_init_size(&accum[3], 3 * nLen + 2) );
   MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );
+  MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) );
 #ifdef MP_USING_WEAVE_COPY
   MP_DIGITS(&tmp) = 0;
   MP_CHECKOK( mp_init_size(&tmp, 3 * nLen + 2) );
 #endif
 
-  mp_set(&accum2, 1);
-  MP_CHECKOK( s_mp_to_mont(&accum2, mmm, &accum2) );
-  /* unlike mp_copy_init, mp_copy is from, to */
-  /* can this be an assert? If we are clamped, we shouldn't ever have a case
-   * where the first window is '0' */
-  if (first_window == 0) {
-    MP_CHECKOK( mp_copy(&accum2, &accum1) );
-  }
-  MP_CHECKOK( mpi_to_weave(&accum2, powers, nLen, num_powers) );
-
-  MP_CHECKOK( mp_copy(montBase, &accum2) );
-  if (first_window == 1) {
-    MP_CHECKOK( mp_copy(&accum2, &accum1) );
+  /* build the first 4 powers inline */
+  if (num_powers > 2) {
+    mp_set(&accum[0], 1);
+    MP_CHECKOK( s_mp_to_mont(&accum[0], mmm, &accum[0]) );
+    MP_CHECKOK( mp_copy(montBase, &accum[1]) );
+    SQR(montBase, &accum[2]);
+    MUL_NOWEAVE(montBase, &accum[2], &accum[3]);
+    MP_CHECKOK( mpi_to_weave(accum, powers, nLen, num_powers) );
+    if (first_window < 4) {
+      MP_CHECKOK( mp_copy(&accum[first_window], &accum1) );
+      first_window = num_powers;
+    }
+  } else {
+      /* assert first_window == 1? */
+      MP_CHECKOK( mp_copy(montBase, &accum1) );
   }
-  MP_CHECKOK( mpi_to_weave(&accum2, powers+1, nLen, num_powers) );
 
 
   /* this adds 2**(k-1)-2 square operations over just calculating the
    * odd powers where k is the window size. We will get some of that
    * back by not needing the first 'N' squares for the window (though
    * squaring 1 is extremely fast, so it's not much savings) */ 
-
-  /* This loop is like this so we can calculate all the powers with only 1
-   * temp variable. This saves us from needing a weaved square routine.
-   */
-  for (i = 2; i < num_powers; i++) {
-    if (i == 2 ) {
-        MP_CHECKOK( mp_sqr(&accum2, &accum2) );
-        MP_CHECKOK( s_mp_redc(&accum2, mmm) );
-	if (first_window == i) {
-	    MP_CHECKOK( mp_copy(&accum2, &accum1) );
-	}
-	MP_CHECKOK( mpi_to_weave(&accum2, powers+i, nLen, num_powers) );
-    } else if ( i & 1 ) {
-	MUL(i-1, montBase, &accum2);
-	if (first_window == i) {
-	    MP_CHECKOK( mp_copy(&accum2, &accum1) );
-	}
-	MP_CHECKOK( mpi_to_weave(&accum2, powers+i, nLen, num_powers) );
+  for (i = 4; i < num_powers; i++) {
+    int acc_index = i & 0x3; /* i % 4 */
+    if ( i & 1 ) {
+      MUL_NOWEAVE(montBase, &accum[acc_index-1] , &accum[acc_index]);
+      /* we've filled the array; do our 'per array' processing */
+      if (acc_index == 3) {
+        MP_CHECKOK( mpi_to_weave(accum, powers + i - 3, nLen, num_powers) );
+
+        if (first_window <= i) {
+          MP_CHECKOK( mp_copy(&accum[first_window & 0x3], &accum1) );
+          first_window = num_powers;
+        }
+      }
     } else {
-	continue;
-    }
-    for (j=i*2; j < num_powers; j *= 2) {
-        MP_CHECKOK( mp_sqr(&accum2, &accum2) );
-        MP_CHECKOK( s_mp_redc(&accum2, mmm) );
-	if (first_window == j) {
-	    MP_CHECKOK( mp_copy(&accum2, &accum1) );
+      /* up to 8 we can find 2^i-1 in the accum array, but at 8 our source
+       * and target are the same, so we need to copy. After that, the
+       * value is overwritten, so we need to fetch it from the stored
+       * weave array */
+      if (i > 8) {
+#ifdef MP_USING_WEAVE_COPY
+        MP_CHECKOK(weave_to_mpi(&accum2, powers+i/2, nLen, num_powers));
+        SQR(&accum2, &accum[acc_index]);
+#else
+	int  prev_index = (acc_index - 1) & 0x3;
+        MUL_NOWEAVE(montBase, &accum[prev_index] , &accum[acc_index]);
+#endif
+      } else {
+	int half_power_index = (i/2) & 0x3;
+	if (half_power_index == acc_index) {
+	   /* copy is cheaper than weave_to_mpi */
+	   MP_CHECKOK(mp_copy(&accum[half_power_index], &accum2));
+	   SQR(&accum2,&accum[acc_index]);
+	} else {
+	   SQR(&accum[half_power_index],&accum[acc_index]);
 	}
-	MP_CHECKOK( mpi_to_weave(&accum2, powers+j, nLen, num_powers) );
+      }
     }
   }
   /* if the accum1 isn't set, then either j was out of range, or our logic
@@ -889,7 +1109,7 @@ mp_err mp_exptmod_safe_i(const mp_int *   montBase,
 	if (!smallExp) {
 	    SQR(pa1,pa2); SWAPPA;
 	} else if (smallExp & 1) {
-	    SQR(pa1,pa2); MUL(1,pa2,pa1);
+	    SQR(pa1,pa2); MUL_NOWEAVE(montBase,pa2,pa1);
 	} else {
 	    ABORT;
 	}
@@ -933,7 +1153,7 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
   mp_int  montBase, goodBase;
   mp_mont_modulus mmm;
 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
-  static int max_window_bits;
+  static unsigned int max_window_bits;
 #endif
 
   /* function for computing n0prime only works if n0 is odd */
@@ -1006,7 +1226,7 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
    * the cache line size.
    */
   if (!max_window_bits) {
-    unsigned long cache_size = mpi_getProcessorLineSize();
+    unsigned long cache_size = s_mpi_getProcessorLineSize();
     /* processor has no cache, use 'fast' code always */
     if (cache_size == 0) {
       mp_using_cache_safe_exp = 0;
author	relyea%netscape.com <devnull@localhost>	2005-11-11 19:53:26 +0000
committer	relyea%netscape.com <devnull@localhost>	2005-11-11 19:53:26 +0000
commit	885fa706d4fc6806355c184035d4c3553b57a5e2 (patch)
tree	95f25a28a5d924c7ad4ee9b8f0995f637ebb8d20
parent	146060d17ac1ee1fabef61661ce40bb4f869f694 (diff)
download	nss-hg-885fa706d4fc6806355c184035d4c3553b57a5e2.tar.gz