author      Niels Möller <nisse@lysator.liu.se>    2013-03-07 15:43:55 +0100
committer   Niels Möller <nisse@lysator.liu.se>    2013-03-07 15:43:55 +0100
commit      32f3ba18f7c9e8715a759380afdc7bbe93d2542e (patch)
tree        7a36c1a7dd3a56fd84d931624d8bd025f6bcdd47 /armv7
parent      11609bf3647cc027f1d0c369c1a22b832acc0305 (diff)
parent      33304507db5e8f1cb80b313b9d4128692b8ee385 (diff)
download    nettle-32f3ba18f7c9e8715a759380afdc7bbe93d2542e.tar.gz
Merge branch 'ecc-support'.
Diffstat (limited to 'armv7')
-rw-r--r--  armv7/README            |   4
-rw-r--r--  armv7/ecc-192-modp.asm  |  93
-rw-r--r--  armv7/ecc-224-modp.asm  | 111
-rw-r--r--  armv7/ecc-256-redc.asm  | 160
-rw-r--r--  armv7/ecc-384-modp.asm  | 257
-rw-r--r--  armv7/ecc-521-modp.asm  | 114
6 files changed, 738 insertions(+), 1 deletion(-)
diff --git a/armv7/README b/armv7/README
index 4d01f30b..9bacd97b 100644
--- a/armv7/README
+++ b/armv7/README
@@ -4,6 +4,8 @@ For efficient loads and stores, use ldmia, stmia and friends. Can do
two loads or stores per cycle with 8-byte aligned addresses, or three
loads or stores in two cycles, regardless of alignment.
+12 usable registers (if we exclude r9).
+
ABI gnueabi(hf) (not depending on the floating point conventions)
Registers May be Argument
@@ -23,7 +25,7 @@ r10 N
r11 N
r12 (ip) Y
r13 (sp)
-r14 (lr)
+r14 (lr) N
r15 (pc)
q0 (d0, d1) Y 1 (for "hf" abi)
diff --git a/armv7/ecc-192-modp.asm b/armv7/ecc-192-modp.asm
new file mode 100644
index 00000000..1b226e30
--- /dev/null
+++ b/armv7/ecc-192-modp.asm
@@ -0,0 +1,93 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "ecc-192-modp.asm"
+ .arm
+
+define(<HP>, <r0>) C Overlaps unused ecc argument
+define(<RP>, <r1>)
+
+define(<T0>, <r2>)
+define(<T1>, <r3>)
+define(<T2>, <r4>)
+define(<T3>, <r5>)
+define(<T4>, <r6>)
+define(<T5>, <r7>)
+define(<T6>, <r8>)
+define(<T7>, <r10>)
+define(<H0>, <T0>) C Overlaps T0 and T1
+define(<H1>, <T1>)
+define(<C2>, <HP>)
+define(<C4>, <r12>)
+
+ C ecc_192_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
+ .text
+ .align 2
+
+PROLOGUE(nettle_ecc_192_modp)
+ push {r4,r5,r6,r7,r8,r10}
+ C Reduce two words at a time
+ add HP, RP, #48
+ add RP, RP, #8
+ ldmdb HP!, {H0,H1}
+ ldm RP, {T2,T3,T4,T5,T6,T7}
+ mov C4, #0
+ adds T4, T4, H0
+ adcs T5, T5, H1
+ adcs T6, T6, H0
+ adcs T7, T7, H1
+ C Need to add carry to T2 and T4, do T4 later.
+ adc C4, C4, #0
+
+ ldmdb HP!, {H0,H1}
+ mov C2, #0
+ adcs T2, T2, H0
+ adcs T3, T3, H1
+ adcs T4, T4, H0
+ adcs T5, T5, H1
+ C Need to add carry to T0 and T2, do T2 later
+ adc C2, C2, #0
+
+ ldmdb RP!, {T0, T1}
+ adcs T0, T0, T6
+ adcs T1, T1, T7
+ adcs T2, T2, T6
+ adcs T3, T3, T7
+ adc C4, C4, #0
+
+ adds T2, T2, C2
+ adcs T3, T3, #0
+ adcs T4, T4, C4
+ adcs T5, T5, #0
+ mov C2, #0
+ adc C2, C2, #0
+
+ C Add in final carry
+ adcs T0, T0, #0
+ adcs T1, T1, #0
+ adcs T2, T2, C2
+ adcs T3, T3, #0
+ adcs T4, T4, #0
+ adc T5, T5, #0
+
+ stm RP, {T0,T1,T2,T3,T4,T5}
+
+ pop {r4,r5,r6,r7,r8,r10}
+ bx lr
+EPILOGUE(nettle_ecc_192_modp)
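
The function above reduces a 384-bit product modulo p = 2^192 - 2^64 - 1 by folding the high limbs back in, using B^3 = B + 1 (mod p); the assembly does this with 32-bit limbs and the carry flag. A minimal C sketch of the same folding, with 64-bit limbs, 128-bit accumulators standing in for the carry chains, and a hypothetical name (it assumes a compiler with unsigned __int128):

#include <stdint.h>

/* Sketch only: reduce a 384-bit value x[0..5] (64-bit limbs, least
   significant first) modulo p = 2^192 - 2^64 - 1, using
   B^3 = B + 1 (mod p) with B = 2^64.  The result r[0..2] is congruent
   to x but may still be >= p; canonical reduction happens elsewhere. */
static void
fold_p192(uint64_t r[3], const uint64_t x[6])
{
  unsigned __int128 acc, t3;
  uint64_t t0, t1, t2, t3lo, t3hi, c;

  /* First fold: x = L + H*B^3 == L + H + H*B, limb by limb. */
  acc = (unsigned __int128) x[0] + x[3];
  t0 = (uint64_t) acc;  acc >>= 64;
  acc += (unsigned __int128) x[1] + x[4] + x[3];
  t1 = (uint64_t) acc;  acc >>= 64;
  acc += (unsigned __int128) x[2] + x[5] + x[4];
  t2 = (uint64_t) acc;  acc >>= 64;
  t3 = acc + x[5];                  /* may be one bit wider than a limb */

  /* Second fold: t3*B^3 == t3 + t3*B. */
  t3lo = (uint64_t) t3;
  t3hi = (uint64_t) (t3 >> 64);     /* 0 or 1 */
  acc = (unsigned __int128) t0 + t3lo;
  r[0] = (uint64_t) acc;  acc >>= 64;
  acc += (unsigned __int128) t1 + t3lo + t3hi;
  r[1] = (uint64_t) acc;  acc >>= 64;
  acc += (unsigned __int128) t2 + t3hi;
  r[2] = (uint64_t) acc;
  c = (uint64_t) (acc >> 64);       /* final carry, 0 or 1 */

  /* Third fold of the final carry: c*B^3 == c*(B + 1).  When c = 1
     the retained limbs are tiny, so this cannot carry out again. */
  acc = (unsigned __int128) r[0] + c;
  r[0] = (uint64_t) acc;  acc >>= 64;
  acc += (unsigned __int128) r[1] + c;
  r[1] = (uint64_t) acc;  acc >>= 64;
  r[2] += (uint64_t) acc;
}
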
diff --git a/armv7/ecc-224-modp.asm b/armv7/ecc-224-modp.asm
new file mode 100644
index 00000000..ef7a703a
--- /dev/null
+++ b/armv7/ecc-224-modp.asm
@@ -0,0 +1,111 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "ecc-224-modp.asm"
+ .arm
+
+define(<RP>, <r1>)
+define(<H>, <r0>) C Overlaps unused ecc argument
+
+define(<T0>, <r2>)
+define(<T1>, <r3>)
+define(<T2>, <r4>)
+define(<T3>, <r5>)
+define(<T4>, <r6>)
+define(<T5>, <r7>)
+define(<T6>, <r8>)
+define(<N3>, <r10>)
+define(<L0>, <r11>)
+define(<L1>, <r12>)
+define(<L2>, <lr>)
+
+ C ecc_224_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
+ .text
+ .align 2
+
+PROLOGUE(nettle_ecc_224_modp)
+ push {r4,r5,r6,r7,r8,r10,r11,lr}
+
+ add L2, RP, #28
+ ldm L2, {T0,T1,T2,T3,T4,T5,T6}
+ mov H, #0
+
+ adds T0, T0, T4
+ adcs T1, T1, T5
+ adcs T2, T2, T6
+ adc H, H, #0
+
+ C This switch from adcs to sbcs takes carry into account with
+ C correct sign, but it always subtracts 1 too much. We arrange
+ C to also add B^7 + 1 below, so the effect is adding p. This
+ C addition of p also ensures that the result never is
+ C negative.
+
+ sbcs N3, T3, T0
+ sbcs T4, T4, T1
+ sbcs T5, T5, T2
+ sbcs T6, T6, H
+ mov H, #1 C This is the B^7
+ sbc H, #0
+ subs T6, T6, T3
+ sbc H, #0
+
+ C Now subtract from low half
+ ldm RP!, {L0,L1,L2}
+
+ C Clear carry, with the sbcs, this is the 1.
+ adds RP, #0
+
+ sbcs T0, L0, T0
+ sbcs T1, L1, T1
+ sbcs T2, L2, T2
+ ldm RP!, {T3,L0,L1,L2}
+ sbcs T3, T3, N3
+ sbcs T4, L0, T4
+ sbcs T5, L1, T5
+ sbcs T6, L2, T6
+ rsc H, H, #0
+
+ C Now -2 <= H <= 0 is the borrow, so subtract (B^3 - 1) |H|
+ C Use (B^3 - 1) H = <H, H, H> if -1 <=H <= 0, and
+ C (B^3 - 1) H = <1,B-1, B-1, B-2> if H = -2
+ subs T0, T0, H
+ asr L1, H, #1
+ sbcs T1, T1, L1
+ eor H, H, L1
+ sbcs T2, T2, L1
+ sbcs T3, T3, H
+ sbcs T4, T4, #0
+ sbcs T5, T5, #0
+ sbcs T6, T6, #0
+ sbcs H, H, H
+
+ C Final borrow, subtract (B^3 - 1) |H|
+ subs T0, T0, H
+ sbcs T1, T1, H
+ sbcs T2, T2, H
+ sbcs T3, T3, #0
+ sbcs T4, T4, #0
+ sbcs T5, T5, #0
+ sbcs T6, T6, #0
+
+ stmdb RP, {T0,T1,T2,T3,T4,T5,T6}
+
+ pop {r4,r5,r6,r7,r8,r10,r11,pc}
+EPILOGUE(nettle_ecc_224_modp)
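
The comment above about switching from adcs to sbcs relies on the ARM definition of SBC: it computes a - b - (1 - C), so a carry flag produced by a preceding addition enters the subtraction with the right sign but one short. A small self-contained check of that identity; the helpers here are hypothetical, written only to emulate the flag semantics:

#include <stdint.h>
#include <stdio.h>

/* Emulated ARM flag semantics, for illustration only. */
static uint32_t carry;  /* models the C flag */

static uint32_t adcs(uint32_t a, uint32_t b)
{
  uint64_t s = (uint64_t) a + b + carry;
  carry = (uint32_t) (s >> 32);
  return (uint32_t) s;
}

static uint32_t sbcs(uint32_t a, uint32_t b)
{
  uint64_t d = (uint64_t) a - b - (1 - carry);
  carry = ((d >> 32) & 1) ? 0 : 1;   /* a borrow clears C */
  return (uint32_t) d;
}

int main(void)
{
  uint32_t x = 10, y = 3;

  carry = 0;
  (void) adcs(0xffffffffu, 1);       /* overflow: carry is now 1 */
  uint32_t r1 = sbcs(x, y);          /* = x - y + 1 - 1 = 7 */

  (void) adcs(1, 1);                 /* no overflow: carry is now 0 */
  uint32_t r0 = sbcs(x, y);          /* = x - y + 0 - 1 = 6 */

  /* Either way the subtraction is short by exactly 1, which the code
     above repays by also adding B^7 + 1, so the net effect of the
     whole sequence is adding p = B^7 - B^3 + 1. */
  printf("with carry: %u, without: %u\n", (unsigned) r1, (unsigned) r0);
  return 0;
}
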
diff --git a/armv7/ecc-256-redc.asm b/armv7/ecc-256-redc.asm
new file mode 100644
index 00000000..cbf10a89
--- /dev/null
+++ b/armv7/ecc-256-redc.asm
@@ -0,0 +1,160 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "ecc-256-redc.asm"
+ .arm
+
+define(<RP>, <r1>)
+
+define(<T0>, <r0>) C Overlaps unused ecc argument
+define(<T1>, <r2>)
+define(<T2>, <r3>)
+define(<T3>, <r4>)
+define(<T4>, <r5>)
+define(<T5>, <r6>)
+define(<T6>, <r7>)
+define(<T7>, <r8>)
+define(<F0>, <r10>)
+define(<F1>, <r11>)
+define(<F2>, <r12>)
+define(<F3>, <lr>)
+
+ C ecc_256_redc (const struct ecc_curve *ecc, mp_limb_t *rp)
+ .text
+ .align 2
+
+PROLOGUE(nettle_ecc_256_redc)
+ push {r4,r5,r6,r7,r8,r10,r11,lr}
+
+ ldm RP!, {T0,T1,T2,T3,T4,T5,T6,T7}
+
+ C Set <F3,F2,F1> to the high 4 limbs of (B^2-B+1)<T2,T1,T0>
+ C T2 T1
+ C T2 T1 T0
+ C - T2 T1 T0
+ C -------------
+ C F3 F2 F1 F0
+
+
+ adds F1, T0, T2
+ adcs F2, T1, #0
+ adc F3, T2, #0
+
+ subs F0, T1, T0
+ sbcs F1, F1, T1 C Could also be rsc ?
+ sbcs F2, F2, T2
+ sbc F3, F3, #0
+
+ C Add:
+ C T10 T9 T8 T7 T6 T5 T4 T3
+ C + F3 F2 F1 F0 T0 T2 T1 T0
+ C --------------------------
+ C T7 T6 T5 T4 T3 T2 T1 T0
+
+ adds T3, T3, T0
+ adcs T1, T4, T1
+ adcs T2, T5, T2
+ adcs T6, T6, T0
+ mov T0, T3 C FIXME: Be more clever?
+ mov T3, T6
+ adcs T4, T7, F0
+
+ ldm RP!, {T5,T6,T7}
+ adcs T5, T5, F1
+ adcs T6, T6, F2
+ adcs T7, T7, F3
+
+ C New F3, F2, F1, F0, also adding in carry
+ adcs F1, T0, T2
+ adcs F2, T1, #0
+ adc F3, T2, #0
+
+ subs F0, T1, T0
+ sbcs F1, F1, T1 C Could also be rsc ?
+ sbcs F2, F2, T2
+ sbc F3, F3, #0
+
+ C Start adding
+ adds T3, T3, T0
+ adcs T1, T4, T1
+ adcs T2, T5, T2
+ adcs T6, T6, T0
+ mov T0, T3 C FIXME: Be more clever?
+ mov T3, T6
+ adcs T4, T7, F0
+
+ ldm RP!, {T5,T6,T7}
+ adcs T5, T5, F1
+ adcs T6, T6, F2
+ adcs T7, T7, F3
+
+ C Final iteration, eliminate only T0, T1
+ C Set <F2, F1, F0> to the high 3 limbs of (B^2-B+1)<T1,T0>
+
+ C T1 T0 T1
+ C - T1 T0
+ C -------------
+ C F2 F1 F0
+
+ C First add in carry
+ adcs F1, T0, #0
+ adcs F2, T1, #0
+ subs F0, T1, T0
+ sbcs F1, F1, T1
+ sbc F2, F2, #0
+
+ C Add:
+ C T9 T8 T7 T6 T5 T4 T3 T2
+ C + F2 F1 F0 T0 0 T1 T0 0
+ C --------------------------
+ C F2 F1 T7 T6 T5 T4 T3 T2
+
+ adds T3, T3, T0
+ adcs T4, T4, T1
+ adcs T5, T5, #0
+ adcs T6, T6, T0
+ adcs T7, T7, F0
+ ldm RP!, {T0, T1}
+ mov F3, #0
+ adcs F1, F1, T0
+ adcs F2, F2, T1
+
+ C Sum is < B^8 + p, so it's enough to fold carry once,
+ C If carry, add in
+ C B^7 - B^6 - B^3 + 1 = <0, B-2, B-1, B-1, B-1, 0, 0, 1>
+
+ C Mask from carry flag, leaving carry intact
+ adc F3, F3, #0
+ rsb F3, F3, #0
+
+ adcs T0, T2, #0
+ adcs T1, T3, #0
+ adcs T2, T4, #0
+ adcs T3, T5, F3
+ adcs T4, T6, F3
+ adcs T5, T7, F3
+ and F3, F3, #-2
+ adcs T6, F1, F3
+ adcs T7, F2, #0
+
+ sub RP, RP, #64
+ stm RP, {T0,T1,T2,T3,T4,T5,T6,T7}
+
+ pop {r4,r5,r6,r7,r8,r10,r11,pc}
+EPILOGUE(nettle_ecc_256_redc)
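
ecc_256_redc eliminates the low limbs Montgomery-style without any multiplications, exploiting the sparse shape of p (with B = 2^32, p = B^8 - B^7 + B^6 + B^3 - 1, so a multiple of p is just shifted copies of the quotient digit). For comparison, a plain word-by-word REDC sketch, which uses only the fact that p = -1 (mod B), so the quotient digit is the low limb itself; 64-bit limbs here, names hypothetical, and the final conditional subtraction against p is left to the caller, matching the partially reduced convention above:

#include <stdint.h>

/* The P-256 prime 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian limbs. */
static const uint64_t p256[4] = {
  0xffffffffffffffffull, 0x00000000ffffffffull,
  0x0000000000000000ull, 0xffffffff00000001ull
};

/* Generic Montgomery reduction sketch: for x < p * B^4, compute
   r == x * B^-4 (mod p) with r < B^4, possibly still >= p.  x is
   clobbered.  Like the assembly, the top carry is folded only once. */
static void
redc_p256(uint64_t r[4], uint64_t x[8])
{
  uint64_t top = 0;                 /* virtual limb above x[7] */

  for (unsigned i = 0; i < 4; i++)
    {
      /* m = x[i] * (-p^-1 mod B) mod B; since p == -1 (mod B), m = x[i]. */
      uint64_t m = x[i];
      unsigned __int128 acc = 0;

      /* x += m * p * B^i; this zeroes limb i. */
      for (unsigned j = 0; j < 4; j++)
        {
          acc += (unsigned __int128) m * p256[j] + x[i + j];
          x[i + j] = (uint64_t) acc;
          acc >>= 64;
        }
      /* Propagate the carry through the remaining limbs. */
      for (unsigned j = i + 4; j < 8; j++)
        {
          acc += x[j];
          x[j] = (uint64_t) acc;
          acc >>= 64;
        }
      top += (uint64_t) acc;
    }

  /* Now x[0..3] are zero and x[4..7] + top*B^4 < 2p.  If top is set,
     subtract p once (equivalently: add B^4 - p and drop the carry). */
  if (top)
    {
      uint64_t borrow = 0;
      for (unsigned i = 0; i < 4; i++)
        {
          unsigned __int128 d =
            (unsigned __int128) x[i + 4] - p256[i] - borrow;
          r[i] = (uint64_t) d;
          borrow = (uint64_t) ((d >> 64) & 1);
        }
    }
  else
    for (unsigned i = 0; i < 4; i++)
      r[i] = x[i + 4];
}
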
diff --git a/armv7/ecc-384-modp.asm b/armv7/ecc-384-modp.asm
new file mode 100644
index 00000000..fb5a6e12
--- /dev/null
+++ b/armv7/ecc-384-modp.asm
@@ -0,0 +1,257 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "ecc-384-modp.asm"
+ .arm
+
+define(<RP>, <r1>)
+define(<T0>, <r0>)
+define(<T1>, <r2>)
+define(<T2>, <r3>)
+define(<T3>, <r4>)
+define(<F0>, <r5>)
+define(<F1>, <r6>)
+define(<F2>, <r7>)
+define(<F3>, <r8>)
+define(<F4>, <r10>)
+define(<N>, <r12>)
+define(<H>, <lr>)
+
+ C ecc_384_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
+ .text
+ .align 2
+
+PROLOGUE(nettle_ecc_384_modp)
+ push {r4,r5,r6,r7,r8,r10,lr}
+
+ add RP, RP, #80
+ ldm RP, {T0, T1, T2, T3} C 20-23
+
+ C First get top 4 limbs, which need folding twice, as
+ C
+ C T3 T2 T1 T0
+ C T3 T2 T1
+ C -T3
+ C ----------------
+ C F4 F3 F2 F1 F0
+ C
+ C Start with
+ C
+ C T3 T1 T0
+ C T1
+ C -T3
+ C -----------
+ C F2 F1 F0 Always fits
+
+ adds F0, T0, T1
+ adcs F1, T1, #0
+ adcs F2, T3, #0
+ subs F0, F0, T3
+ sbcs F1, F1, #0
+ sbcs F2, F2, #0
+
+ C T3 T2 T2 0
+ C F2 F1 F0
+ C ----------------
+ C F4 F3 F2 F1 F0
+
+ mov F4, #0
+ adds F1, F1, T2
+ adcs F2, F2, T2
+ adcs F3, T3, #0
+ adcs F4, F4, #0
+
+ C Add in to high part
+ sub RP, RP, #32
+ ldm RP, {T0, T1, T2, T3} C 12-15
+ mov H, #0
+ adds F0, T0, F0
+ adcs F1, T1, F1
+ adcs F2, T2, F2
+ adcs F3, T3, F3
+ adcs F4, F4, #0 C Do F4 later
+
+ C Add to low part, keeping carry (positive or negative) in H
+ sub RP, RP, #48
+ ldm RP, {T0, T1, T2, T3} C 0-3
+ mov H, #0
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+ adc H, H, #0
+ subs T1, T1, F0
+ sbcs T2, T2, F1
+ sbcs T3, T3, F2
+ sbc H, H, #0
+ adds T3, T3, F0
+ adc H, H, #0
+
+ stm RP!, {T0,T1,T2,T3} C 0-3
+ mov N, #2
+.Loop:
+ ldm RP, {T0,T1,T2,T3} C 4-7
+
+ C First, propagate carry
+ adds T0, T0, H
+ asr H, #31 C Sign extend
+ adcs T1, T1, H
+ adcs T2, T2, H
+ adcs T3, T3, H
+ adc H, H, #0
+
+ C +B^4 term
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+ adc H, H, #0
+
+ C +B^3 terms
+ ldr F0, [RP, #+48] C 16
+ adds T0, T0, F1
+ adcs T1, T1, F2
+ adcs T2, T2, F3
+ adcs T3, T3, F0
+ adc H, H, #0
+
+ C -B
+ ldr F1, [RP, #+52] C 17-18
+ ldr F2, [RP, #+56]
+ subs T0, T0, F3
+ sbcs T1, T1, F0
+ sbcs T2, T2, F1
+ sbcs T3, T3, F2
+ sbcs H, H, #0
+
+ C +1
+ ldr F3, [RP, #+60] C 19
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+ adc H, H, #0
+ subs N, N, #1
+ stm RP!, {T0,T1,T2,T3}
+ bne .Loop
+
+ C Fold high limbs, we need to add in
+ C
+ C F4 F4 0 -F4 F4 H H 0 -H H
+ C
+ C We always have F4 >= 0, but we can have H < 0.
+ C Sign extension gets tricky when F4 = 0 and H < 0.
+ sub RP, RP, #48
+
+ ldm RP, {T0,T1,T2,T3} C 0-3
+
+ C H H 0 -H H
+ C ----------------
+ C S H F3 F2 F1 F0
+ C
+ C Define S = H >> 31 (asr), we then have
+ C
+ C F0 = H
+ C F1 = S - H
+ C F2 = - [H > 0]
+ C F3 = H - [H > 0]
+ C H = H + S
+ C
+ C And we get underflow in S - H iff H > 0
+
+ C H = 0 H > 0 H = -1
+ mov F0, H C 0 H -1
+ asr H, #31
+ subs F1, H, F0 C 0,C=1 -H,C=0 0,C=1
+ sbc F2, F2, F2 C 0 -1 0
+ sbc F3, F0, #0 C 0 H-1 -1
+
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+ adc H, H, F0 C 0+cy H+cy -2+cy
+
+ stm RP!, {T0,T1,T2,T3} C 0-3
+ ldm RP, {T0,T1,T2,T3} C 4-7
+
+ C F4 0 -F4
+ C ---------
+ C F3 F2 F1
+
+ rsbs F1, F4, #0
+ sbc F2, F2, F2
+ sbc F3, F4, #0
+
+ C Sign extend H
+ adds F0, F4, H
+ asr H, H, #31
+ adcs F1, F1, H
+ adcs F2, F2, H
+ adcs F3, F3, H
+ adcs F4, F4, H
+ adc H, H, #0
+
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+
+ stm RP!, {T0,T1,T2,T3} C 4-7
+ ldm RP, {T0,T1,T2,T3} C 8-11
+
+ adcs T0, T0, F4
+ adcs T1, T1, H
+ adcs T2, T2, H
+ adcs T3, T3, H
+ adc H, H, #0
+
+ stm RP, {T0,T1,T2,T3} C 8-11
+
+ C Final (unlikely) carry
+ sub RP, RP, #32
+ ldm RP, {T0,T1,T2,T3} C 0-3
+ C Fold H into F0-F4
+ mov F0, H
+ asr H, #31
+ subs F1, H, F0
+ sbc F2, F2, F2
+ sbc F3, F0, #0
+ add F4, F0, H
+
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ adcs T3, T3, F3
+
+ stm RP!, {T0,T1,T2,T3} C 0-3
+ ldm RP, {T0,T1,T2,T3} C 4-7
+ adcs T0, T0, F4
+ adcs T1, T1, H
+ adcs T2, T2, H
+ adcs T3, T3, H
+ stm RP!, {T0,T1,T2,T3} C 4-7
+ ldm RP, {T0,T1,T2,T3} C 8-11
+ adcs T0, T0, H
+ adcs T1, T1, H
+ adcs T2, T2, H
+ adcs T3, T3, H
+ stm RP!, {T0,T1,T2,T3} C 8-11
+ pop {r4,r5,r6,r7,r8,r10,pc}
+EPILOGUE(nettle_ecc_384_modp)
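
A recurring idiom in this file is keeping a small signed carry in H and propagating it with adds followed by asr #31 and a chain of adcs. The same pattern as a rough C sketch; add_signed_limb is a hypothetical helper, and it assumes the usual arithmetic right shift for signed values:

#include <stddef.h>
#include <stdint.h>

/* Add a small signed correction h (which may be negative, as H above
   may be) into an n-limb number x, least significant limb first.
   Mirrors the adds / asr #31 / adcs chain: the carry that ripples up
   is sign-extended rather than just 0 or 1.  The carry out of the top
   limb is returned; the assembly folds it back in using the shape of p. */
static int64_t
add_signed_limb(uint32_t *x, size_t n, int32_t h)
{
  int64_t acc = h;

  for (size_t i = 0; i < n; i++)
    {
      acc += x[i];
      x[i] = (uint32_t) acc;
      acc >>= 32;               /* arithmetic shift: carry is -1, 0 or +1 */
    }
  return acc;
}

This corresponds to the "First, propagate carry" step at the top of .Loop, where H holds the signed carry left over from the previous block of four limbs.
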
diff --git a/armv7/ecc-521-modp.asm b/armv7/ecc-521-modp.asm
new file mode 100644
index 00000000..fe305805
--- /dev/null
+++ b/armv7/ecc-521-modp.asm
@@ -0,0 +1,114 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "ecc-521-modp.asm"
+ .arm
+
+define(<HP>, <r0>)
+define(<RP>, <r1>)
+define(<T0>, <r2>)
+define(<T1>, <r3>)
+define(<T2>, <r4>)
+define(<F0>, <r5>)
+define(<F1>, <r6>)
+define(<F2>, <r7>)
+define(<F3>, <r8>)
+define(<H>, <r12>)
+define(<N>, <lr>)
+
+ C ecc_521_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
+ .text
+.Lc511:
+ .int 511
+
+ .align 2
+
+PROLOGUE(nettle_ecc_521_modp)
+ push {r4,r5,r6,r7,r8,lr}
+
+ C Use that B^17 = 2^23 (mod p)
+ ldr F3, [RP, #+68] C 17
+ add HP, RP, #72 C 18
+ ldr T0, [RP] C 0
+ adds T0, T0, F3, lsl #23
+ str T0, [RP], #+4
+ mov N, #5
+
+ C 5 iterations, reading limbs 18-20, 21-23, 24-26, 27-29, 30-32
+ C and adding to limbs 1-3, 4-6, 7-9, 10-12, 13-15
+.Loop:
+ ldm RP, {T0,T1,T2} C 1+3*k -- 3+3*k
+ lsr F0, F3, #9
+ ldm HP!, {F1,F2,F3} C 18+3*k -- 20+3*k
+ orr F0, F0, F1, lsl #23
+ lsr F1, F1, #9
+ orr F1, F1, F2, lsl #23
+ lsr F2, F2, #9
+ orr F2, F2, F3, lsl #23
+ adcs T0, T0, F0
+ adcs T1, T1, F1
+ adcs T2, T2, F2
+ sub N, N, #1
+ stm RP!,{T0,T1,T2}
+ teq N, #0
+ bne .Loop
+
+ ldr F0, [RP], #-64 C 16
+ ldr F1, [HP] C 33
+ ldr T0, .Lc511
+
+ C Handling of high limbs
+ C F0 = rp[16] + carry in + F3 >> 9
+ adcs F0, F0, F3, lsr #9
+ C Copy low 9 bits to H, then shift right including carry
+ and H, F0, T0
+ rrx F0, F0
+ lsr F0, F0, #8
+ C Add in F1 = rp[33], with weight 2^1056 = 2^14
+ adds F0, F0, F1, lsl #14
+ lsr F1, F1, #18
+ adc F1, F1, #0
+
+ ldm RP, {T0, T1} C 0-1
+ adds T0, T0, F0
+ adcs T1, T1, F1
+ stm RP!, {T0, T1}
+
+ ldm RP, {T0,T1,T2,F0,F1,F2,F3} C 2-8
+ adcs T0, T0, #0
+ adcs T1, T1, #0
+ adcs T2, T2, #0
+ adcs F0, F0, #0
+ adcs F1, F1, #0
+ adcs F2, F2, #0
+ adcs F3, F3, #0
+ stm RP!, {T0,T1,T2,F0,F1,F2,F3} C 2-8
+ ldm RP, {T0,T1,T2,F0,F1,F2,F3} C 9-15
+ adcs T0, T0, #0
+ adcs T1, T1, #0
+ adcs T2, T2, #0
+ adcs F0, F0, #0
+ adcs F1, F1, #0
+ adcs F2, F2, #0
+ adcs F3, F3, #0
+ adcs H, H, #0
+ stm RP, {T0,T1,T2,F0,F1,F2,F3,H} C 9-16
+
+ pop {r4,r5,r6,r7,r8,pc}
+EPILOGUE(nettle_ecc_521_modp)
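
Reduction modulo p = 2^521 - 1 is a pure shift-and-add fold: a bit at position 521 + k is congruent to a bit at position k. The code above works with 32-bit limbs and therefore folds at limb 17, using B^17 = 2^544 = 2^23 (mod p). The same idea with 64-bit limbs, folding directly at bit 521 (a sketch, not the limb layout nettle uses):

#include <stdint.h>

/* Sketch: reduce a value of up to 1042 bits, stored in seventeen
   64-bit limbs x[0..16] (least significant first), modulo
   p = 2^521 - 1, by folding the part above bit 521 back in, since
   2^521 == 1 (mod p).  The result r[0..8] is < 2^522, so it may still
   need one more small fold or conditional subtraction elsewhere. */
static void
fold_p521(uint64_t r[9], const uint64_t x[17])
{
  uint64_t hi[9];
  unsigned __int128 acc = 0;
  unsigned i;

  /* hi = x >> 521, i.e. shift right by 8 limbs plus 9 bits. */
  for (i = 0; i < 8; i++)
    hi[i] = (x[8 + i] >> 9) | (x[9 + i] << 55);
  hi[8] = x[16] >> 9;

  /* r = (x mod 2^521) + hi */
  for (i = 0; i < 9; i++)
    {
      uint64_t lo = (i < 8) ? x[i] : (x[8] & 0x1ff);
      acc += (unsigned __int128) lo + hi[i];
      r[i] = (uint64_t) acc;
      acc >>= 64;
    }
  /* No carry escapes limb 8: both addends there are at most 9 bits. */
}
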