diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2021-04-16 19:15:53 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2021-06-19 13:49:48 +0300 |
commit | ccfa9f2c1427b40483984198c3df41f8057f69f8 (patch) | |
tree | cca120b4c26cf720b2dd09170ff35f60fe9af6f0 /mpi | |
parent | 9c12226c31d4fed4942c049724591a82c4428408 (diff) | |
download | libgcrypt-ccfa9f2c1427b40483984198c3df41f8057f69f8.tar.gz |
mpi/ec: small optimization for ec_mulm_448
* mpi/ec.c (ec_addm_448, ec_subm_448): Change order of sub_n and
set_cond to remove need to clear 'n'.
(ec_mulm_448): Use memcpy where possible; Use mpih_rshift where
possible; Use mpih_lshift for doubling a3; Remove one addition
at end.
--
Benchmarks on AMD Ryzen 7 5800X:
Before:
Ed448 | nanosecs/iter cycles/iter auto Mhz
keygen | 893096 4343326 4863
sign | 988422 4795694 4852
verify | 1899706 9215952 4851
After (~5% faster):
Ed448 | nanosecs/iter cycles/iter auto Mhz
keygen | 822078 3987952 4851
sign | 947327 4595433 4851
verify | 1776259 8616675 4851
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'mpi')
-rw-r--r-- | mpi/ec.c | 76 |
1 files changed, 22 insertions, 54 deletions
@@ -464,14 +464,13 @@ ec_addm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx) if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize) log_bug ("addm_448: different sizes\n"); - memset (n, 0, sizeof n); up = u->d; vp = v->d; wp = w->d; cy = _gcry_mpih_add_n (wp, up, vp, wsize); - mpih_set_cond (n, ctx->p->d, wsize, (cy != 0UL)); - _gcry_mpih_sub_n (wp, wp, n, wsize); + _gcry_mpih_sub_n (n, wp, ctx->p->d, wsize); + mpih_set_cond (wp, n, wsize, (cy != 0UL)); } static void @@ -485,14 +484,13 @@ ec_subm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx) if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize) log_bug ("subm_448: different sizes\n"); - memset (n, 0, sizeof n); up = u->d; vp = v->d; wp = w->d; borrow = _gcry_mpih_sub_n (wp, up, vp, wsize); - mpih_set_cond (n, ctx->p->d, wsize, (borrow != 0UL)); - _gcry_mpih_add_n (wp, wp, n, wsize); + _gcry_mpih_add_n (n, wp, ctx->p->d, wsize); + mpih_set_cond (wp, n, wsize, (borrow != 0UL)); } static void @@ -506,10 +504,6 @@ ec_mulm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx) mpi_limb_t b0[LIMB_SIZE_HALF_448]; mpi_limb_t b1[LIMB_SIZE_HALF_448]; mpi_limb_t cy; - int i; -#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) - mpi_limb_t b1_rest, a3_rest; -#endif if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize) log_bug ("mulm_448: different sizes\n"); @@ -520,61 +514,37 @@ ec_mulm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx) _gcry_mpih_mul_n (n, up, vp, wsize); - for (i = 0; i < (wsize + 1)/ 2; i++) - { - b0[i] = n[i]; - b1[i] = n[i+wsize/2]; - a2[i] = n[i+wsize]; - a3[i] = n[i+wsize+wsize/2]; - } + memcpy (b0, n, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB); + memcpy (a2, n + wsize, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB); #if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) b0[LIMB_SIZE_HALF_448-1] &= ((mpi_limb_t)1UL<<32)-1; a2[LIMB_SIZE_HALF_448-1] &= ((mpi_limb_t)1UL<<32)-1; - - b1_rest = 0; - a3_rest = 0; - - for (i = (wsize + 1)/ 2 -1; i >= 0; i--) - { - mpi_limb_t b1v, a3v; - b1v = b1[i]; - a3v = a3[i]; - b1[i] = (b1_rest<<32) | (b1v >> 32); - a3[i] = (a3_rest<<32) | (a3v >> 32); - b1_rest = b1v & (((mpi_limb_t)1UL <<32)-1); - a3_rest = a3v & (((mpi_limb_t)1UL <<32)-1); - } + _gcry_mpih_rshift (b1, n + wsize/2, LIMB_SIZE_HALF_448, 32); + _gcry_mpih_rshift (a3, n + wsize + wsize/2, LIMB_SIZE_HALF_448, 32); +#else + memcpy (b1, n + wsize/2, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB); + memcpy (a3, n + wsize + wsize/2, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB); #endif cy = _gcry_mpih_add_n (b0, b0, a2, LIMB_SIZE_HALF_448); - cy += _gcry_mpih_add_n (b0, b0, a3, LIMB_SIZE_HALF_448); - for (i = 0; i < (wsize + 1)/ 2; i++) - wp[i] = b0[i]; + cy += _gcry_mpih_add_n (wp, b0, a3, LIMB_SIZE_HALF_448); #if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) + cy = wp[LIMB_SIZE_HALF_448-1] >> 32; wp[LIMB_SIZE_HALF_448-1] &= (((mpi_limb_t)1UL <<32)-1); #endif + memset (b0, 0, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB); + b0[0] = cy; -#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) - cy = b0[LIMB_SIZE_HALF_448-1] >> 32; -#endif - - cy = _gcry_mpih_add_1 (b1, b1, LIMB_SIZE_HALF_448, cy); + cy = _gcry_mpih_add_n (b1, b1, b0, LIMB_SIZE_HALF_448); + cy += _gcry_mpih_lshift (a3, a3, LIMB_SIZE_HALF_448, 1); cy += _gcry_mpih_add_n (b1, b1, a2, LIMB_SIZE_HALF_448); cy += _gcry_mpih_add_n (b1, b1, a3, LIMB_SIZE_HALF_448); - cy += _gcry_mpih_add_n (b1, b1, a3, LIMB_SIZE_HALF_448); #if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) - b1_rest = 0; - for (i = (wsize + 1)/ 2 -1; i >= 0; i--) - { - mpi_limb_t b1v = b1[i]; - b1[i] = (b1_rest<<32) | (b1v >> 32); - b1_rest = b1v & (((mpi_limb_t)1UL <<32)-1); - } - wp[LIMB_SIZE_HALF_448-1] |= (b1_rest << 32); + cy = _gcry_mpih_rshift (b1, b1, LIMB_SIZE_HALF_448, 32); + wp[LIMB_SIZE_HALF_448-1] |= cy; #endif - for (i = 0; i < wsize / 2; i++) - wp[i+(wsize + 1) / 2] = b1[i]; + memcpy (wp + LIMB_SIZE_HALF_448, b1, (wsize / 2) * BYTES_PER_MPI_LIMB); #if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) cy = b1[LIMB_SIZE_HALF_448-1]; @@ -590,10 +560,8 @@ ec_mulm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx) n[0] = cy; _gcry_mpih_add_n (wp, wp, n, wsize); - memset (n, 0, wsize * BYTES_PER_MPI_LIMB); - cy = _gcry_mpih_sub_n (wp, wp, ctx->p->d, wsize); - mpih_set_cond (n, ctx->p->d, wsize, (cy != 0UL)); - _gcry_mpih_add_n (wp, wp, n, wsize); + cy = _gcry_mpih_sub_n (n, wp, ctx->p->d, wsize); + mpih_set_cond (wp, n, wsize, (cy == 0UL)); } static void |