summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNiels Möller <nisse@lysator.liu.se>2013-03-04 15:18:10 +0100
committerNiels Möller <nisse@lysator.liu.se>2013-03-04 15:18:10 +0100
commit16768e1f7f3ce6744c46ebf61008e0a68dd2c677 (patch)
tree5b34385d632839b16db742df62f7916e6ed4aa3e
parent3ac426dfe3a7bd9ab242098305ef7c4b775bac5c (diff)
downloadnettle-16768e1f7f3ce6744c46ebf61008e0a68dd2c677.tar.gz
ARM assembly for ecc_384_modp.
-rw-r--r--ChangeLog9
-rw-r--r--armv7/ecc-384-modp.asm257
-rw-r--r--configure.ac2
-rw-r--r--ecc-384.c7
4 files changed, 273 insertions, 2 deletions
diff --git a/ChangeLog b/ChangeLog
index 832eec45..162699af 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2013-03-04 Niels Möller <nisse@lysator.liu.se>
+
+ * configure.ac (asm_optional_list): Added ecc-384-modp.asm. Deleted
+ bogus reference to $asm_search_list.
+
+ * ecc-384.c: Check HAVE_NATIVE_ecc_384_modp, and use native
+ version if available.
+ * armv7/ecc-384-modp.asm: New file, 3x speedup over the C version.
+
2013-03-03 Niels Möller <nisse@lysator.liu.se>
* ecc-256.c: Fixed definition of USE_REDC.
diff --git a/armv7/ecc-384-modp.asm b/armv7/ecc-384-modp.asm
new file mode 100644
index 00000000..e34d95f8
--- /dev/null
+++ b/armv7/ecc-384-modp.asm
@@ -0,0 +1,257 @@
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "ecc-384-modp.asm"
+ .arm
+
+define(<RP>, <r1>)
+define(<T0>, <r0>)
+define(<T1>, <r2>)
+define(<T2>, <r3>)
+define(<T3>, <r4>)
+define(<F0>, <r5>)
+define(<F1>, <r6>)
+define(<F2>, <r7>)
+define(<F3>, <r8>)
+define(<F4>, <r10>)
+define(<N>, <r12>)
+define(<H>, <lr>)
+
+ C ecc_384_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
+ .text
+ .align 2
+
+C Reduce a 24-limb (768-bit) product at rp modulo the P-384 prime.
+C p is 12 limbs and B^12 - p = B^4 + B^3 - B + 1 (B = 2^32), so each
+C limb above position 11 folds into lower positions with those weights;
+C the top 4 limbs (20-23) need folding twice.  The 12-limb result is
+C left in rp[0..11].  H carries a running (possibly negative) carry
+C between 4-limb windows; N counts loop iterations.
+PROLOGUE(nettle_ecc_384_modp)
+ push {r4,r5,r6,r7,r8,r10,lr}
+
+ add RP, RP, #80 C RP -> limb 20 (byte offset 80 = 20*4)
+ ldm RP, {T0, T1, T2, T3} C 20-23
+
+ C First get top 4 limbs, which need folding twice, as
+ C
+ C T3 T2 T1 T0
+ C T3 T2 T1
+ C -T3
+ C ----------------
+ C F4 F3 F2 F1 F0
+ C
+ C Start with
+ C
+ C T3 T1 T0
+ C T1
+ C -T3
+ C -----------
+ C F2 F1 F0 Always fits
+
+ adds F0, T0, T1
+ adcs F1, T1, #0
+ adcs F2, T3, #0
+ subs F0, F0, T3
+ sbcs F1, F1, #0
+ sbcs F2, F2, #0
+
+ C T3 T2 T2 0
+ C F2 F1 F0
+ C ----------------
+ C F4 F3 F2 F1 F0
+
+ mov F4, #0
+ adds F1, F1, T2
+ adcs F2, F2, T2
+ adcs F3, T3, #0
+ adcs F4, F4, #0
+
+ C Add in to high part
+ sub RP, RP, #32 C RP -> limb 12
+ ldm RP, {T0, T1, T2, T3} C 12-15
+ mov H, #0 C NOTE(review): H is not read before being cleared again below; this clear looks redundant -- confirm
+ adds F0, T0, F0
+ adcs F1, T1, F1
+ adcs F2, T2, F2
+ adcs F3, T3, F3
+ adcs F4, F4, #0 C Do F4 later
+
+ C Add to low part, keeping carry (positive or negative) in H
+ sub RP, RP, #48 C RP -> limb 0
+ ldm RP, {T0, T1, T2, T3} C 0-3
+ mov H, #0
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+ adc H, H, #0
+ subs T1, T1, F0
+ sbcs T2, T2, F1
+ sbcs T3, T3, F2
+ sbc H, H, #0
+ adds T3, T3, F0
+ adc H, H, #0
+
+ stm RP!, {T0,T1,T2,T3} C 0-3
+ mov N, #2 C two iterations: limbs 4-7, then 8-11
+.Loop:
+ ldm RP, {T0,T1,T2,T3} C 4-7
+
+ C First, propagate carry
+ adds T0, T0, H
+ asr H, #31 C Sign extend
+ adcs T1, T1, H
+ adcs T2, T2, H
+ adcs T3, T3, H
+ adc H, H, #0
+
+ C +B^4 term
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+ adc H, H, #0
+
+ C +B^3 terms
+ ldr F0, [RP, #+48] C 16
+ adds T0, T0, F1
+ adcs T1, T1, F2
+ adcs T2, T2, F3
+ adcs T3, T3, F0
+ adc H, H, #0
+
+ C -B
+ ldr F1, [RP, #+52] C 17-18
+ ldr F2, [RP, #+56]
+ subs T0, T0, F3
+ sbcs T1, T1, F0
+ sbcs T2, T2, F1
+ sbcs T3, T3, F2
+ sbcs H, H, #0
+
+ C +1
+ ldr F3, [RP, #+60] C 19
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+ adc H, H, #0
+ subs N, N, #1
+ stm RP!, {T0,T1,T2,T3}
+ bne .Loop
+
+ C Fold high limbs, we need to add in
+ C
+ C F4 F4 -F4 F4 H H -H H
+ C
+ C We always have F4 >= 0, but we can have H < 0.
+ C Sign extension gets tricky when F4 = 0 and H < 0.
+ sub RP, RP, #48 C RP back to limb 0
+
+ ldm RP, {T0,T1,T2,T3} C 0-3
+
+ C H H 0 -H H
+ C ----------------
+ C S F4 F3 F2 F1 F0
+ C
+ C Define S = H >> 31 (asr), we then have
+ C
+ C F0 = H
+ C F1 = S - H
+ C F2 = - [H > 0]
+ C F3 = H - [H > 0]
+ C F4 = H + S
+ C
+ C And we get underflow in S - H iff H > 0
+
+ C H = 0 H > 0 H = -1
+ mov F0, H C 0 H -1
+ asr H, #31
+ subs F1, H, F0 C 0,C=1 -H,C=0 0,C=1
+ sbc F2, F2, F2 C 0 -1 0
+ sbc F3, F0, #0 C 0 H-1 -1
+
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+ adc H, H, F0 C 0+cy H+cy -2+cy
+
+ stm RP!, {T0,T1,T2,T3} C 0-3
+ ldm RP, {T0,T1,T2,T3} C 4-7
+
+ C F4 0 -F4
+ C ---------
+ C F3 F2 F1
+
+ rsbs F1, F4, #0
+ sbc F2, F2, F2
+ sbc F3, F4, #0
+
+ C Sign extend H
+ adds F0, F4, H
+ asr H, H, #31
+ adcs F1, F1, H
+ adcs F2, F2, H
+ adcs F3, F3, H
+ adcs F4, F4, H
+ adc H, H, #0
+
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+
+ stm RP!, {T0,T1,T2,T3} C 4-7
+ ldm RP, {T0,T1,T2,T3} C 8-11
+
+ adcs T0, T0, F4
+ adcs T1, T1, H
+ adcs T2, T2, H
+ adcs T3, T3, H
+ adc H, H, #0
+
+ stm RP, {T0,T1,T2,T3} C 8-11
+
+ C Final (unlikely) carry
+ sub RP, RP, #32 C RP back to limb 0
+ ldm RP, {T0,T1,T2,T3} C 0-3
+ C Fold H into F0-F4
+ mov F0, H
+ asr H, #31
+ subs F1, H, F0
+ sbc F2, F2, F2
+ sbc F3, F0, #0
+ add F4, F0, H
+
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+
+ stm RP!, {T0,T1,T2,T3} C 0-3
+ ldm RP, {T0,T1,T2,T3} C 4-7
+ adcs T0, T0, F4
+ adcs T1, T1, H
+ adcs T2, T2, H
+ adcs T3, T3, H
+ stm RP!, {T0,T1,T2,T3} C 4-7
+ ldm RP, {T0,T1,T2,T3} C 8-11
+ adcs T0, T0, H
+ adcs T1, T1, H
+ adcs T2, T2, H
+ adcs T3, T3, H
+ stm RP!, {T0,T1,T2,T3} C 8-11
+ pop {r4,r5,r6,r7,r8,r10,pc}
+EPILOGUE(nettle_ecc_384_modp)
diff --git a/configure.ac b/configure.ac
index aa8817c5..64ca2397 100644
--- a/configure.ac
+++ b/configure.ac
@@ -250,7 +250,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
asm_optional_list=""
if test "x$enable_public_key" = "xyes" ; then
- asm_optional_list="$asm_search_list ecc-192-modp.asm ecc-224-modp.asm ecc-256-redc.asm"
+ asm_optional_list="ecc-192-modp.asm ecc-224-modp.asm ecc-256-redc.asm ecc-384-modp.asm"
fi
OPT_ASM_SOURCES=""
diff --git a/ecc-384.c b/ecc-384.c
index 6595b152..ffa19fec 100644
--- a/ecc-384.c
+++ b/ecc-384.c
@@ -36,13 +36,18 @@
#include "ecc-384.h"
+#if HAVE_NATIVE_ecc_384_modp
+#define ecc_384_modp nettle_ecc_384_modp
+void
+ecc_384_modp (const struct ecc_curve *ecc, mp_limb_t *rp);
+#elif GMP_NUMB_BITS == 32
+
/* Use that 2^{384} = 2^{128} + 2^{96} - 2^{32} + 1, and eliminate 256
bits at a time.
We can get carry == 2 in the first iteration, and I think *only* in
the first iteration. */
-#if GMP_NUMB_BITS == 32
/* p is 12 limbs, and B^12 - p = B^4 + B^3 - B + 1. We can eliminate
almost 8 at a time. Do only 7, to avoid additional carry
propagation, followed by 5. */