diff options
author | Niels Möller <nisse@lysator.liu.se> | 2013-03-04 15:18:10 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2013-03-04 15:18:10 +0100 |
commit | 16768e1f7f3ce6744c46ebf61008e0a68dd2c677 (patch) | |
tree | 5b34385d632839b16db742df62f7916e6ed4aa3e | |
parent | 3ac426dfe3a7bd9ab242098305ef7c4b775bac5c (diff) | |
download | nettle-16768e1f7f3ce6744c46ebf61008e0a68dd2c677.tar.gz |
ARM assembly for ecc_384_modp.
-rw-r--r-- | ChangeLog | 9 | ||||
-rw-r--r-- | armv7/ecc-384-modp.asm | 257 | ||||
-rw-r--r-- | configure.ac | 2 | ||||
-rw-r--r-- | ecc-384.c | 7 |
4 files changed, 273 insertions, 2 deletions
@@ -1,3 +1,12 @@ +2013-03-04 Niels Möller <nisse@lysator.liu.se> + + * configure.ac (asm_optional_list): Added ecc-384-modp.asm. Deleted + bogus reference to $asm_search_list. + + * ecc-384.c: Check HAVE_NATIVE_ecc_384_modp, and use native + version if available. + * armv7/ecc-384-modp.asm: New file, 3 times speedup over C version. + 2013-03-03 Niels Möller <nisse@lysator.liu.se> * ecc-256.c: Fixed definition of USE_REDC. diff --git a/armv7/ecc-384-modp.asm b/armv7/ecc-384-modp.asm new file mode 100644 index 00000000..e34d95f8 --- /dev/null +++ b/armv7/ecc-384-modp.asm @@ -0,0 +1,257 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013, Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. 
+ + .file "ecc-384-modp.asm" + .arm + +define(<RP>, <r1>) +define(<T0>, <r0>) +define(<T1>, <r2>) +define(<T2>, <r3>) +define(<T3>, <r4>) +define(<F0>, <r5>) +define(<F1>, <r6>) +define(<F2>, <r7>) +define(<F3>, <r8>) +define(<F4>, <r10>) +define(<N>, <r12>) +define(<H>, <lr>) + + C ecc_384_modp (const struct ecc_curve *ecc, mp_limb_t *rp) + .text + .align 2 + +PROLOGUE(nettle_ecc_384_modp) + push {r4,r5,r6,r7,r8,r10,lr} + + add RP, RP, #80 + ldm RP, {T0, T1, T2, T3} C 20-23 + + C First get top 4 limbs, which need folding twice, as + C + C T3 T2 T1 T0 + C T3 T2 T1 + C -T3 + C ---------------- + C F4 F3 F2 F1 F0 + C + C Start with + C + C T3 T1 T0 + C T1 + C -T3 + C ----------- + C F2 F1 F0 Always fits + + adds F0, T0, T1 + adcs F1, T1, #0 + adcs F2, T3, #0 + subs F0, F0, T3 + sbcs F1, F1, #0 + sbcs F2, F2, #0 + + C T3 T2 T2 0 + C F2 F1 F0 + C ---------------- + C F4 F3 F2 F1 F0 + + mov F4, #0 + adds F1, F1, T2 + adcs F2, F2, T2 + adcs F3, T3, #0 + adcs F4, F4, #0 + + C Add in to high part + sub RP, RP, #32 + ldm RP, {T0, T1, T2, T3} C 12-15 + mov H, #0 + adds F0, T0, F0 + adcs F1, T1, F1 + adcs F2, T2, F2 + adcs F3, T3, F3 + adcs F4, F4, #0 C Do F4 later + + C Add to low part, keeping carry (positive or negative) in H + sub RP, RP, #48 + ldm RP, {T0, T1, T2, T3} C 0-3 + mov H, #0 + adds T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + adcs T3, T3, F3 + adc H, H, #0 + subs T1, T1, F0 + sbcs T2, T2, F1 + sbcs T3, T3, F2 + sbc H, H, #0 + adds T3, T3, F0 + adc H, H, #0 + + stm RP!, {T0,T1,T2,T3} C 0-3 + mov N, #2 +.Loop: + ldm RP, {T0,T1,T2,T3} C 4-7 + + C First, propagate carry + adds T0, T0, H + asr H, #31 C Sign extend + adcs T1, T1, H + adcs T2, T2, H + adcs T3, T3, H + adc H, H, #0 + + C +B^4 term + adds T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + adcs T3, T3, F3 + adc H, H, #0 + + C +B^3 terms + ldr F0, [RP, #+48] C 16 + adds T0, T0, F1 + adcs T1, T1, F2 + adcs T2, T2, F3 + adcs T3, T3, F0 + adc H, H, #0 + + C -B + ldr F1, [RP, #+52] C 17-18 + ldr F2, [RP, 
#+56] + subs T0, T0, F3 + sbcs T1, T1, F0 + sbcs T2, T2, F1 + sbcs T3, T3, F2 + sbcs H, H, #0 + + C +1 + ldr F3, [RP, #+60] C 19 + adds T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + adcs T3, T3, F3 + adc H, H, #0 + subs N, N, #1 + stm RP!, {T0,T1,T2,T3} + bne .Loop + + C Fold high limbs, we need to add in + C + C F4 F4 -F4 F4 H H -H H + C + C We always have F4 >= 0, but we can have H < 0. + C Sign extension gets tricky when F4 = 0 and H < 0. + sub RP, RP, #48 + + ldm RP, {T0,T1,T2,T3} C 0-3 + + C H H 0 -H H + C ---------------- + C S F4 F3 F2 F1 F0 + C + C Define S = H >> 31 (asr), we then have + C + C F0 = H + C F1 = S - H + C F2 = - [H > 0] + C F3 = H - [H > 0] + C F4 = H + S + C + C And we get underflow in S - H iff H > 0 + + C H = 0 H > 0 H = -1 + mov F0, H C 0 H -1 + asr H, #31 + subs F1, H, F0 C 0,C=1 -H,C=0 0,C=1 + sbc F2, F2, F2 C 0 -1 0 + sbc F3, F0, #0 C 0 H-1 -1 + + adds T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + adcs T3, T3, F3 + adc H, H, F0 C 0+cy H+cy -2+cy + + stm RP!, {T0,T1,T2,T3} C 0-3 + ldm RP, {T0,T1,T2,T3} C 4-7 + + C F4 0 -F4 + C --------- + C F3 F2 F1 + + rsbs F1, F4, #0 + sbc F2, F2, F2 + sbc F3, F4, #0 + + C Sign extend H + adds F0, F4, H + asr H, H, #31 + adcs F1, F1, H + adcs F2, F2, H + adcs F3, F3, H + adcs F4, F4, H + adc H, H, #0 + + adds T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + adcs T3, T3, F3 + + stm RP!, {T0,T1,T2,T3} C 4-7 + ldm RP, {T0,T1,T2,T3} C 8-11 + + adcs T0, T0, F4 + adcs T1, T1, H + adcs T2, T2, H + adcs T3, T3, H + adc H, H, #0 + + stm RP, {T0,T1,T2,T3} C 8-11 + + C Final (unlikely) carry + sub RP, RP, #32 + ldm RP, {T0,T1,T2,T3} C 0-3 + C Fold H into F0-F4 + mov F0, H + asr H, #31 + subs F1, H, F0 + sbc F2, F2, F2 + sbc F3, F0, #0 + add F4, F0, H + + adds T0, T0, F0 + adcs T1, T1, F1 + adcs T2, T2, F2 + adcs T3, T3, F3 + + stm RP!, {T0,T1,T2,T3} C 0-3 + ldm RP, {T0,T1,T2,T3} C 4-7 + adcs T0, T0, F4 + adcs T1, T1, H + adcs T2, T2, H + adcs T3, T3, H + stm RP!, {T0,T1,T2,T3} C 4-7 + ldm RP, {T0,T1,T2,T3} 
C 8-11 + adcs T0, T0, H + adcs T1, T1, H + adcs T2, T2, H + adcs T3, T3, H + stm RP!, {T0,T1,T2,T3} C 8-11 + pop {r4,r5,r6,r7,r8,r10,pc} +EPILOGUE(nettle_ecc_384_modp) diff --git a/configure.ac b/configure.ac index aa8817c5..64ca2397 100644 --- a/configure.ac +++ b/configure.ac @@ -250,7 +250,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ asm_optional_list="" if test "x$enable_public_key" = "xyes" ; then - asm_optional_list="$asm_search_list ecc-192-modp.asm ecc-224-modp.asm ecc-256-redc.asm" + asm_optional_list="ecc-192-modp.asm ecc-224-modp.asm ecc-256-redc.asm ecc-384-modp.asm" fi OPT_ASM_SOURCES="" @@ -36,13 +36,18 @@ #include "ecc-384.h" +#if HAVE_NATIVE_ecc_384_modp +#define ecc_384_modp nettle_ecc_384_modp +void +ecc_384_modp (const struct ecc_curve *ecc, mp_limb_t *rp); +#elif GMP_NUMB_BITS == 32 + /* Use that 2^{384} = 2^{128} + 2^{96} - 2^{32} + 1, and eliminate 256 bits at a time. We can get carry == 2 in the first iteration, and I think *only* in the first iteration. */ -#if GMP_NUMB_BITS == 32 /* p is 12 limbs, and B^12 - p = B^4 + B^3 - B + 1. We can eliminate almost 8 at a time. Do only 7, to avoid additional carry propagation, followed by 5. */ |