path: root/third_party/unacl-curve25519/core
author    Nicolas Boichat <drinkcat@chromium.org>  2019-05-09 13:35:30 +0900
committer Commit Bot <commit-bot@chromium.org>  2020-03-25 02:13:31 +0000
commit    c830ecc1f728b722fde24a5da14a89f9223f291c (patch)
tree      f60b011c069bec340f3eaef1f835a88ca4a70fe7 /third_party/unacl-curve25519/core
parent    d1f7ca6f287eeeeba0d397eed87473415276ac6b (diff)
download  chrome-ec-c830ecc1f728b722fde24a5da14a89f9223f291c.tar.gz
core/cortex-m0/curve25519: Move code to third_party folder
Also, add LICENSE file (some files are under CC0, some are public
domain), and METADATA file.

BRANCH=none
BUG=chromium:884905
TEST=make buildall -j, which also includes basic tests.

Change-Id: Ib3a7eb9245a0634c4052064c3e36cbe2ddafbcb9
Signed-off-by: Nicolas Boichat <drinkcat@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/ec/+/1599761
Reviewed-by: Aseda Aboagye <aaboagye@chromium.org>
Diffstat (limited to 'third_party/unacl-curve25519/core')
-rw-r--r--  third_party/unacl-curve25519/core/cortex-m0/curve25519/mpy121666.S    |  181
-rw-r--r--  third_party/unacl-curve25519/core/cortex-m0/curve25519/mul.S          | 1111
-rw-r--r--  third_party/unacl-curve25519/core/cortex-m0/curve25519/reduce25519.S  |  163
-rw-r--r--  third_party/unacl-curve25519/core/cortex-m0/curve25519/scalarmult.c   |  588
-rw-r--r--  third_party/unacl-curve25519/core/cortex-m0/curve25519/sqr.S          | 1164
5 files changed, 3207 insertions(+), 0 deletions(-)
diff --git a/third_party/unacl-curve25519/core/cortex-m0/curve25519/mpy121666.S b/third_party/unacl-curve25519/core/cortex-m0/curve25519/mpy121666.S
new file mode 100644
index 0000000000..d2a467459b
--- /dev/null
+++ b/third_party/unacl-curve25519/core/cortex-m0/curve25519/mpy121666.S
@@ -0,0 +1,181 @@
+// Implementation of multiplication of an fe25519 value with the curve constant 121666.
+//
+// B. Haase, Endress + Hauser Conducta GmbH & Co. KG
+// public domain.
+//
+// gnu assembler format.
+//
+// Generated and tested with C++ functions in the test subdirectory.
+//
+// ATTENTION:
+// Not yet tested on target hardware.
+
+
+ .code 16
+ .text
+ .align 2
+
+ .global fe25519_mpyWith121666_asm
+ .code 16
+ .thumb_func
+ .type fe25519_mpyWith121666_asm, %function
+
+fe25519_mpyWith121666_asm:
+ push {r4,r5,r6,r7,r14}
+ ldr r7,=56130
+ ldr r2,[r1,#28]
+ lsl r5,r2,#16
+ lsr r6,r2,#16
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r5,r2
+ mov r2,#0
+ adc r6,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r5,r2
+ adc r6,r3
+ lsl r2,r5,#1
+ lsr r2,r2,#1
+ str r2,[r0,#28]
+ lsr r5,r5,#31
+ lsl r6,r6,#1
+ orr r5,r6
+ mov r6,#19
+ mul r5,r6
+ mov r6,#0
+ ldr r2,[r1,#0]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r5,r3
+ adc r6,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r5,r2
+ mov r2,#0
+ adc r6,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r5,r2
+ adc r6,r3
+ str r5,[r0,#0]
+ mov r5,#0
+ ldr r2,[r1,#4]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r6,r3
+ adc r5,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r6,r2
+ mov r2,#0
+ adc r5,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r6,r2
+ adc r5,r3
+ str r6,[r0,#4]
+ mov r6,#0
+ ldr r2,[r1,#8]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r5,r3
+ adc r6,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r5,r2
+ mov r2,#0
+ adc r6,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r5,r2
+ adc r6,r3
+ str r5,[r0,#8]
+ mov r5,#0
+ ldr r2,[r1,#12]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r6,r3
+ adc r5,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r6,r2
+ mov r2,#0
+ adc r5,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r6,r2
+ adc r5,r3
+ str r6,[r0,#12]
+ mov r6,#0
+ ldr r2,[r1,#16]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r5,r3
+ adc r6,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r5,r2
+ mov r2,#0
+ adc r6,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r5,r2
+ adc r6,r3
+ str r5,[r0,#16]
+ mov r5,#0
+ ldr r2,[r1,#20]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r6,r3
+ adc r5,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r6,r2
+ mov r2,#0
+ adc r5,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r6,r2
+ adc r5,r3
+ str r6,[r0,#20]
+ mov r6,#0
+ ldr r2,[r1,#24]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r5,r3
+ adc r6,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r5,r2
+ mov r2,#0
+ adc r6,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r5,r2
+ adc r6,r3
+ str r5,[r0,#24]
+ mov r5,#0
+ ldr r2,[r0,#28]
+ add r6,r2
+ str r6,[r0,#28]
+ pop {r4,r5,r6,r7,r15}
+
+ .size fe25519_mpyWith121666_asm, .-fe25519_mpyWith121666_asm
+
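For readers following the assembly: fe25519_mpyWith121666_asm multiplies a
256-bit little-endian value by the curve constant 121666 (decomposed as
2^16 + 56130, hence the 16-bit partial products suited to the M0's 32x32->32
multiplier) and folds everything at or above bit 255 back in with weight 19,
since 2^255 == 19 (mod 2^255 - 19). Below is a rough C model of the
word-level carry chain; the function name and types are illustrative, not
part of this commit.

#include <stdint.h>

/* Hedged C sketch of what fe25519_mpyWith121666_asm computes. Word 7 is
 * handled first so the overflow above bit 255 can seed the carry chain
 * for word 0; this also keeps the routine safe for out == in, matching
 * the assembly. */
void fe25519_mpyWith121666_ref(uint32_t out[8], const uint32_t in[8])
{
    uint64_t acc = (uint64_t)in[7] * 121666u;
    out[7] = (uint32_t)acc & 0x7fffffffu;   /* keep the low 31 bits */
    uint64_t carry = (acc >> 31) * 19u;     /* 2^255 == 19 (mod p)  */

    for (int i = 0; i < 7; i++) {
        carry += (uint64_t)in[i] * 121666u;
        out[i] = (uint32_t)carry;
        carry >>= 32;
    }
    out[7] += (uint32_t)carry;              /* small final carry, no overflow */
}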
diff --git a/third_party/unacl-curve25519/core/cortex-m0/curve25519/mul.S b/third_party/unacl-curve25519/core/cortex-m0/curve25519/mul.S
new file mode 100644
index 0000000000..366713a7a3
--- /dev/null
+++ b/third_party/unacl-curve25519/core/cortex-m0/curve25519/mul.S
@@ -0,0 +1,1111 @@
+ .align 2
+ .global multiply256x256_asm
+ .type multiply256x256_asm, %function
+multiply256x256_asm:
+ push {r4-r7,lr}
+ mov r3, r8
+ mov r4, r9
+ mov r5, r10
+ mov r6, r11
+ push {r0-r6}
+ mov r12, r0
+ mov r10, r2
+ mov r11, r1
+ mov r0,r2
+ //ldm r0!, {r4,r5,r6,r7}
+ ldm r0!, {r4,r5}
+ add r0,#8
+ ldm r1!, {r2,r3,r6,r7}
+ push {r0,r1}
+ /////////BEGIN LOW PART //////////////////////
+ /////////MUL128/////////////
+ //MUL64
+ mov r6, r5
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0
+ eor r5, r0
+ sub r5, r0
+ sub r1, r3
+ sbc r7, r7
+ eor r1, r7
+ sub r1, r7
+ eor r7, r0
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4
+ lsr r4, r6,#16
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5
+ eor r6, r6
+ add r2, r1
+ adc r3, r6
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0
+ lsr r0, r1,#16
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5
+ eor r1,r7
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7
+ //////////////////////////
+ mov r4, r12
+ stm r4!, {r0,r1}
+ push {r4}
+ push {r0,r1}
+ mov r1, r10
+ mov r10, r2
+ ldm r1, {r0, r1, r4, r5}
+ mov r2, r4
+ mov r7, r5
+ sub r2, r0
+ sbc r7, r1
+ sbc r6, r6
+ eor r2, r6
+ eor r7, r6
+ sub r2, r6
+ sbc r7, r6
+ push {r2, r7}
+ mov r2, r11
+ mov r11, r3
+ ldm r2, {r0, r1, r2, r3}
+ sub r0, r2
+ sbc r1, r3
+ sbc r7, r7
+ eor r0, r7
+ eor r1, r7
+ sub r0, r7
+ sbc r1, r7
+ eor r7, r6
+ mov r12, r7
+ push {r0, r1}
+ //MUL64
+ mov r6, r5
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0
+ eor r5, r0
+ sub r5, r0
+ sub r1, r3
+ sbc r7, r7
+ eor r1, r7
+ sub r1, r7
+ eor r7, r0
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4
+ lsr r4, r6,#16
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5
+ eor r6, r6
+ add r2, r1
+ adc r3, r6
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0
+ lsr r0, r1,#16
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5
+ eor r1,r7
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7
+ mov r4, r10
+ mov r5, r11
+ eor r6, r6
+ add r0, r4
+ adc r1, r5
+ adc r2, r6
+ adc r3, r6
+ mov r10, r2
+ mov r11, r3
+ pop {r2-r5}
+ push {r0, r1}
+ mov r6, r5
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0
+ eor r5, r0
+ sub r5, r0
+ sub r1, r3
+ sbc r7, r7
+ eor r1, r7
+ sub r1, r7
+ eor r7, r0
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4
+ lsr r4, r6,#16
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5
+ eor r6, r6
+ add r2, r1
+ adc r3, r6
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0
+ lsr r0, r1,#16
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5
+ eor r1,r7
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7
+ pop {r4, r5}
+ mov r6, r12
+ mov r7, r12
+ eor r0, r6
+ eor r1, r6
+ eor r2, r6
+ eor r3, r6
+ asr r6, r6, #1
+ adc r0, r4
+ adc r1, r5
+ adc r4, r2
+ adc r5, r3
+ eor r2, r2
+ adc r6,r2
+ adc r7,r2
+ pop {r2, r3}
+ mov r8, r2
+ mov r9, r3
+ add r2, r0
+ adc r3, r1
+ mov r0, r10
+ mov r1, r11
+ adc r4, r0
+ adc r5, r1
+ adc r6, r0
+ adc r7, r1
+ ////////END LOW PART/////////////////////
+ pop {r0}
+ stm r0!, {r2,r3}
+ pop {r1,r2}
+ push {r0}
+ push {r4-r7}
+ mov r10, r1
+ mov r11, r2
+ ldm r1!, {r4, r5}
+ ldm r2, {r2, r3}
+ /////////BEGIN HIGH PART////////////////
+ /////////MUL128/////////////
+ //MUL64
+ mov r6, r5
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0
+ eor r5, r0
+ sub r5, r0
+ sub r1, r3
+ sbc r7, r7
+ eor r1, r7
+ sub r1, r7
+ eor r7, r0
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4
+ lsr r4, r6,#16
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5
+ eor r6, r6
+ add r2, r1
+ adc r3, r6
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0
+ lsr r0, r1,#16
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5
+ eor r1,r7
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7
+ push {r0,r1}
+ mov r1, r10
+ mov r10, r2
+ ldm r1, {r0, r1, r4, r5}
+ mov r2, r4
+ mov r7, r5
+ sub r2, r0
+ sbc r7, r1
+ sbc r6, r6
+ eor r2, r6
+ eor r7, r6
+ sub r2, r6
+ sbc r7, r6
+ push {r2, r7}
+ mov r2, r11
+ mov r11, r3
+ ldm r2, {r0, r1, r2, r3}
+ sub r0, r2
+ sbc r1, r3
+ sbc r7, r7
+ eor r0, r7
+ eor r1, r7
+ sub r0, r7
+ sbc r1, r7
+ eor r7, r6
+ mov r12, r7
+ push {r0, r1}
+ //MUL64
+ mov r6, r5
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0
+ eor r5, r0
+ sub r5, r0
+ sub r1, r3
+ sbc r7, r7
+ eor r1, r7
+ sub r1, r7
+ eor r7, r0
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4
+ lsr r4, r6,#16
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5
+ eor r6, r6
+ add r2, r1
+ adc r3, r6
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0
+ lsr r0, r1,#16
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5
+ eor r1,r7
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7
+ mov r4, r10
+ mov r5, r11
+ eor r6, r6
+ add r0, r4
+ adc r1, r5
+ adc r2, r6
+ adc r3, r6
+ mov r10, r2
+ mov r11, r3
+ pop {r2-r5}
+ push {r0, r1}
+ mov r6, r5
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0
+ eor r5, r0
+ sub r5, r0
+ sub r1, r3
+ sbc r7, r7
+ eor r1, r7
+ sub r1, r7
+ eor r7, r0
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4
+ lsr r4, r6,#16
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5
+ eor r6, r6
+ add r2, r1
+ adc r3, r6
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0
+ lsr r0, r1,#16
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5
+ eor r1,r7
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7
+ pop {r4, r5}
+ mov r6, r12
+ mov r7, r12
+ eor r0, r6
+ eor r1, r6
+ eor r2, r6
+ eor r3, r6
+ asr r6, r6, #1
+ adc r0, r4
+ adc r1, r5
+ adc r4, r2
+ adc r5, r3
+ eor r2, r2
+ adc r6,r2 //0,1
+ adc r7,r2
+ pop {r2, r3}
+ mov r8, r2
+ mov r9, r3
+ add r2, r0
+ adc r3, r1
+ mov r0, r10
+ mov r1, r11
+ adc r4, r0
+ adc r5, r1
+ adc r6, r0
+ adc r7, r1
+ ////////END HIGH PART/////////////////////
+ mov r0, r8
+ mov r1, r9
+ mov r8, r6
+ mov r9, r7
+ pop {r6, r7}
+ add r0, r6
+ adc r1, r7
+ pop {r6, r7}
+ adc r2, r6
+ adc r3, r7
+ pop {r7}
+ stm r7!, {r0-r3}
+ mov r10, r7
+ eor r0,r0
+ mov r6, r8
+ mov r7, r9
+ adc r4, r0
+ adc r5, r0
+ adc r6, r0
+ adc r7, r0
+ pop {r0,r1,r2}
+ mov r12, r2
+ push {r0, r4-r7}
+ ldm r1, {r0-r7}
+ sub r0, r4
+ sbc r1, r5
+ sbc r2, r6
+ sbc r3, r7
+ eor r4, r4
+ sbc r4, r4
+ eor r0, r4
+ eor r1, r4
+ eor r2, r4
+ eor r3, r4
+ sub r0, r4
+ sbc r1, r4
+ sbc r2, r4
+ sbc r3, r4
+ mov r6, r12
+ mov r12, r4 //carry
+ mov r5, r10
+ stm r5!, {r0-r3}
+ mov r11, r5
+ mov r8, r0
+ mov r9, r1
+ ldm r6, {r0-r7}
+ sub r4, r0
+ sbc r5, r1
+ sbc r6, r2
+ sbc r7, r3
+ eor r0, r0
+ sbc r0, r0
+ eor r4, r0
+ eor r5, r0
+ eor r6, r0
+ eor r7, r0
+ sub r4, r0
+ sbc r5, r0
+ sbc r6, r0
+ sbc r7, r0
+ mov r1, r12
+ eor r0, r1
+ mov r1, r11
+ stm r1!, {r4-r7}
+ push {r0}
+ mov r2, r8
+ mov r3, r9
+ /////////BEGIN MIDDLE PART////////////////
+ /////////MUL128/////////////
+ //MUL64
+ mov r6, r5
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0
+ eor r5, r0
+ sub r5, r0
+ sub r1, r3
+ sbc r7, r7
+ eor r1, r7
+ sub r1, r7
+ eor r7, r0
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4
+ lsr r4, r6,#16
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5
+ eor r6, r6
+ add r2, r1
+ adc r3, r6
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0
+ lsr r0, r1,#16
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5
+ eor r1,r7
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7
+ push {r0,r1}
+ mov r1, r10
+ mov r10, r2
+ ldm r1, {r0, r1, r4, r5}
+ mov r2, r4
+ mov r7, r5
+ sub r2, r0
+ sbc r7, r1
+ sbc r6, r6
+ eor r2, r6
+ eor r7, r6
+ sub r2, r6
+ sbc r7, r6
+ push {r2, r7}
+ mov r2, r11
+ mov r11, r3
+ ldm r2, {r0, r1, r2, r3}
+ sub r0, r2
+ sbc r1, r3
+ sbc r7, r7
+ eor r0, r7
+ eor r1, r7
+ sub r0, r7
+ sbc r1, r7
+ eor r7, r6
+ mov r12, r7
+ push {r0, r1}
+ //MUL64
+ mov r6, r5
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0
+ eor r5, r0
+ sub r5, r0
+ sub r1, r3
+ sbc r7, r7
+ eor r1, r7
+ sub r1, r7
+ eor r7, r0
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4
+ lsr r4, r6,#16
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5
+ eor r6, r6
+ add r2, r1
+ adc r3, r6
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0
+ lsr r0, r1,#16
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5
+ eor r1,r7
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7
+ mov r4, r10
+ mov r5, r11
+ eor r6, r6
+ add r0, r4
+ adc r1, r5
+ adc r2, r6
+ adc r3, r6
+ mov r10, r2
+ mov r11, r3
+ pop {r2-r5}
+ push {r0, r1}
+ mov r6, r5
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0
+ eor r5, r0
+ sub r5, r0
+ sub r1, r3
+ sbc r7, r7
+ eor r1, r7
+ sub r1, r7
+ eor r7, r0
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4
+ lsr r4, r6,#16
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5
+ eor r6, r6
+ add r2, r1
+ adc r3, r6
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0
+ lsr r0, r1,#16
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5
+ eor r1,r7
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7
+ pop {r4, r5}
+ mov r6, r12
+ mov r7, r12
+ eor r0, r6
+ eor r1, r6
+ eor r2, r6
+ eor r3, r6
+ asr r6, r6, #1
+ adc r0, r4
+ adc r1, r5
+ adc r4, r2
+ adc r5, r3
+ eor r2, r2
+ adc r6,r2 //0,1
+ adc r7,r2
+ pop {r2, r3}
+ mov r8, r2
+ mov r9, r3
+ add r2, r0
+ adc r3, r1
+ mov r0, r10
+ mov r1, r11
+ adc r4, r0
+ adc r5, r1
+ adc r6, r0
+ adc r7, r1
+ //////////END MIDDLE PART////////////////
+ pop {r0,r1} //r0,r1
+ mov r12, r0 //negative
+ eor r2, r0
+ eor r3, r0
+ eor r4, r0
+ eor r5, r0
+ eor r6, r0
+ eor r7, r0
+ push {r4-r7}
+ ldm r1!, {r4-r7}
+ mov r11, r1 //reference
+ mov r1, r9
+ eor r1, r0
+ mov r10, r4
+ mov r4, r8
+ asr r0, #1
+ eor r0, r4
+ mov r4, r10
+ adc r0, r4
+ adc r1, r5
+ adc r2, r6
+ adc r3, r7
+ eor r4, r4
+ adc r4, r4
+ mov r10, r4 //carry
+ mov r4, r11
+ ldm r4, {r4-r7}
+ add r0, r4
+ adc r1, r5
+ adc r2, r6
+ adc r3, r7
+ mov r9, r4
+ mov r4, r11
+ stm r4!, {r0-r3}
+ mov r11, r4
+ pop {r0-r3}
+ mov r4, r9
+ adc r4, r0
+ adc r5, r1
+ adc r6, r2
+ adc r7, r3
+ mov r1, #0
+ adc r1, r1
+ mov r0, r10
+ mov r10, r1 //carry
+ asr r0, #1
+ pop {r0-r3}
+ adc r4, r0
+ adc r5, r1
+ adc r6, r2
+ adc r7, r3
+ mov r8, r0
+ mov r0, r11
+ stm r0!, {r4-r7}
+ mov r11, r0
+ mov r0, r8
+ mov r6, r12
+ mov r5, r10
+ eor r4, r4
+ adc r5, r6
+ adc r6, r4
+ add r0, r5
+ adc r1, r6
+ adc r2, r6
+ adc r3, r6
+ mov r7, r11
+ stm r7!, {r0-r3}
+ pop {r3-r6}
+ mov r8, r3
+ mov r9, r4
+ mov r10, r5
+ mov r11, r6
+ pop {r4-r7,pc}
+ bx lr
+.size multiply256x256_asm, .-multiply256x256_asm
+
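multiply256x256_asm is a recursive subtractive (refined) Karatsuba: the low
and high half-products are computed directly, and the middle term is
reconstructed from |A0 - A1| * |B0 - B1| together with a sign word, which is
what the repeated sub/sbc/eor/sub sequences prepare. A minimal C sketch of
the same identity, one level down at 64x64 bits with 32-bit limbs (names are
illustrative; unsigned __int128 is a gcc/clang extension used only to keep
the sketch short):

#include <stdint.h>

typedef unsigned __int128 u128;

static void mul64x64_karatsuba(uint64_t res[2], uint64_t a, uint64_t b)
{
    uint32_t a0 = (uint32_t)a, a1 = (uint32_t)(a >> 32);
    uint32_t b0 = (uint32_t)b, b1 = (uint32_t)(b >> 32);

    uint64_t lo = (uint64_t)a0 * b0;             /* low half-product  */
    uint64_t hi = (uint64_t)a1 * b1;             /* high half-product */

    uint32_t da = a0 >= a1 ? a0 - a1 : a1 - a0;  /* |a0 - a1|         */
    uint32_t db = b0 >= b1 ? b0 - b1 : b1 - b0;  /* |b0 - b1|         */
    int neg = (a0 >= a1) != (b0 >= b1);          /* sign of (a0-a1)*(b0-b1) */

    /* a0*b1 + a1*b0 == lo + hi - (a0-a1)*(b0-b1), so add or subtract
     * da*db depending on the tracked sign. */
    u128 mid = (u128)lo + hi;
    u128 d = (uint64_t)da * db;
    mid = neg ? mid + d : mid - d;

    u128 r = lo + (mid << 32) + ((u128)hi << 64);
    res[0] = (uint64_t)r;
    res[1] = (uint64_t)(r >> 64);
}

The payoff is three half-size multiplications instead of four; the assembly
applies this once at 256 bits and again inside each 128-bit half.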
diff --git a/third_party/unacl-curve25519/core/cortex-m0/curve25519/reduce25519.S b/third_party/unacl-curve25519/core/cortex-m0/curve25519/reduce25519.S
new file mode 100644
index 0000000000..9a3c29a0f6
--- /dev/null
+++ b/third_party/unacl-curve25519/core/cortex-m0/curve25519/reduce25519.S
@@ -0,0 +1,163 @@
+// Implementation of a partial reduction modulo 2^255 - 38.
+//
+// B. Haase, Endress + Hauser Conducta GmbH & Co. KG
+// public domain.
+//
+// gnu assembler format.
+//
+// Generated and tested with C++ functions in the test subdirectory and on the target.
+//
+
+ .code 16
+
+ .text
+ .align 2
+
+ .global fe25519_reduceTo256Bits_asm
+ .code 16
+ .thumb_func
+ .type fe25519_reduceTo256Bits_asm, %function
+
+fe25519_reduceTo256Bits_asm:
+ push {r4,r5,r6,r7,r14}
+ ldr r2,[r1,#60]
+ lsr r3,r2,#16
+ uxth r2,r2
+ mov r7,#38
+ mul r2,r7
+ mul r3,r7
+ ldr r4,[r1,#28]
+ lsr r5,r3,#16
+ lsl r3,r3,#16
+ mov r6,#0
+ add r4,r2
+ adc r5,r6
+ add r4,r3
+ adc r5,r6
+ lsl r2,r4,#1
+ lsr r2,r2,#1
+ str r2,[r0,#28]
+ lsr r4,r4,#31
+ lsl r5,r5,#1
+ orr r4,r5
+ mov r2,#19
+ mul r2,r4
+ ldr r4,[r1,#0]
+ add r2,r4
+ mov r3,#0
+ adc r3,r6
+ ldr r4,[r1,#32]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r2,r4
+ adc r3,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ str r2,[r0,#0]
+ ldr r4,[r1,#4]
+ add r3,r4
+ mov r2,#0
+ adc r2,r6
+ ldr r4,[r1,#36]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r3,r4
+ adc r2,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r3,r4
+ adc r2,r5
+ str r3,[r0,#4]
+ ldr r4,[r1,#8]
+ add r2,r4
+ mov r3,#0
+ adc r3,r6
+ ldr r4,[r1,#40]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r2,r4
+ adc r3,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ str r2,[r0,#8]
+ ldr r4,[r1,#12]
+ add r3,r4
+ mov r2,#0
+ adc r2,r6
+ ldr r4,[r1,#44]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r3,r4
+ adc r2,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r3,r4
+ adc r2,r5
+ str r3,[r0,#12]
+ ldr r4,[r1,#16]
+ add r2,r4
+ mov r3,#0
+ adc r3,r6
+ ldr r4,[r1,#48]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r2,r4
+ adc r3,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ str r2,[r0,#16]
+ ldr r4,[r1,#20]
+ add r3,r4
+ mov r2,#0
+ adc r2,r6
+ ldr r4,[r1,#52]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r3,r4
+ adc r2,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r3,r4
+ adc r2,r5
+ str r3,[r0,#20]
+ ldr r4,[r1,#24]
+ add r2,r4
+ mov r3,#0
+ adc r3,r6
+ ldr r4,[r1,#56]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r2,r4
+ adc r3,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ str r2,[r0,#24]
+ ldr r4,[r0,#28]
+ add r4,r3
+ str r4,[r0,#28]
+ pop {r4,r5,r6,r7,r15}
+
+ .size fe25519_reduceTo256Bits_asm, .-fe25519_reduceTo256Bits_asm
+
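fe25519_reduceTo256Bits_asm folds the upper 256 bits of a 512-bit product
back into the lower half with weight 38, since 2^256 == 2 * 19 (mod 2^255 - 19),
and then folds the carry above bit 255 with weight 19; the result fits in
256 bits but is only partially reduced. A hedged C model of the same carry
chain (names are illustrative, not part of this commit):

#include <stdint.h>

/* Partially reduce a 512-bit value (sixteen 32-bit words, little endian)
 * modulo 2^255 - 19, mirroring the word order of the assembly above. */
void fe25519_reduceTo256Bits_ref(uint32_t r[8], const uint32_t in[16])
{
    /* Word 7 first, so the overflow above bit 255 can be folded
     * immediately into the carry for word 0. */
    uint64_t acc = in[7] + (uint64_t)in[15] * 38u;
    r[7] = (uint32_t)acc & 0x7fffffffu;
    uint64_t carry = (acc >> 31) * 19u;

    for (int i = 0; i < 7; i++) {
        carry += in[i] + (uint64_t)in[i + 8] * 38u;
        r[i] = (uint32_t)carry;
        carry >>= 32;
    }
    r[7] += (uint32_t)carry;    /* small final carry, no overflow */
}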
diff --git a/third_party/unacl-curve25519/core/cortex-m0/curve25519/scalarmult.c b/third_party/unacl-curve25519/core/cortex-m0/curve25519/scalarmult.c
new file mode 100644
index 0000000000..07e2b144e7
--- /dev/null
+++ b/third_party/unacl-curve25519/core/cortex-m0/curve25519/scalarmult.c
@@ -0,0 +1,588 @@
+/* ============================================================================
+   ============================ C/C++ HEADER FILE =============================
+   ============================================================================
+
+ Collection of all submodules from naclM0 required for curve25519
+ scalar multiplication alone (not including randomization, etc.).
+
+ Library naclM0 is largely based on the avrNacl work of M. Hutter and P. Schwabe.
+
+ Will compile to the two functions
+
+ int
+ crypto_scalarmult_base_curve25519(
+ unsigned char* q,
+ const unsigned char* n
+ );
+
+ int
+ crypto_scalarmult_curve25519 (
+ unsigned char* r,
+ const unsigned char* s,
+ const unsigned char* p
+ );
+
+ Requires inttypes.h header and the four external assembly functions
+
+ extern void
+ fe25519_reduceTo256Bits_asm (
+ fe25519 *res,
+ const UN_512bitValue *in
+ );
+
+ extern void
+ fe25519_mpyWith121666_asm (
+ fe25519* out,
+ const fe25519* in
+ );
+
+ extern void
+ multiply256x256_asm (
+ UN_512bitValue* result,
+ const UN_256bitValue* x,
+ const UN_256bitValue* y
+ );
+
+ extern void
+ square256_asm (
+ UN_512bitValue* result,
+ const UN_256bitValue* x
+ );
+
+ \file scalarmult.c
+
+ \Author B. Haase, Endress + Hauser Conducta GmbH & Co. KG
+
+ Distributed under the conditions of the
+ Creative Commons CC0 1.0 Universal public domain dedication
+ ============================================================================*/
+
+#include "curve25519.h"
+#include "util.h"
+
+typedef uint8_t uint8;
+typedef uint16_t uint16;
+typedef uint32_t uint32;
+typedef uint64_t uint64;
+typedef uintptr_t uintptr;
+
+typedef int8_t int8;
+typedef int16_t int16;
+typedef int32_t int32;
+typedef int64_t int64;
+typedef intptr_t intptr;
+
+// Note that it's important to define the uint8 as first union member, so that
+// an array of uint8 may be used as initializer.
+typedef union UN_256bitValue_
+{
+ uint8 as_uint8[32];
+ uint16 as_uint16[16];
+ uint32 as_uint32[8];
+ uint64 as_uint64[4];
+} UN_256bitValue;
+
+// Note that it's important to define the uint8 as first union member, so that
+// an array of uint8 may be used as initializer.
+typedef union UN_512bitValue_
+{
+ uint8 as_uint8[64];
+ uint16 as_uint16[32];
+ uint32 as_uint32[16];
+ uint64 as_uint64[8];
+ UN_256bitValue as_256_bitValue[2];
+} UN_512bitValue;
+
+typedef UN_256bitValue fe25519;
+
+// ****************************************************
+// Assembly functions.
+// ****************************************************
+
+extern void
+fe25519_reduceTo256Bits_asm(
+ fe25519 *res,
+ const UN_512bitValue *in
+);
+
+#define fe25519_mpyWith121666 fe25519_mpyWith121666_asm
+extern void
+fe25519_mpyWith121666_asm (
+ fe25519* out,
+ const fe25519* in
+);
+
+#define multiply256x256 multiply256x256_asm
+extern void
+multiply256x256(
+ UN_512bitValue* result,
+ const UN_256bitValue* x,
+ const UN_256bitValue* y
+);
+
+#define square256 square256_asm
+extern void
+square256(
+ UN_512bitValue* result,
+ const UN_256bitValue* x
+);
+
+// ****************************************************
+// C functions for fe25519
+// ****************************************************
+
+static void
+fe25519_cpy(
+ fe25519* dest,
+ const fe25519* source
+)
+{
+ memcpy(dest, source, 32);
+}
+
+static void
+fe25519_unpack(
+ fe25519* out,
+ const unsigned char in[32]
+)
+{
+ memcpy(out, in, 32);
+
+ out->as_uint8[31] &= 0x7f; // make sure that the most significant bit is cleared.
+}
+
+static void
+fe25519_sub(
+ fe25519* out,
+ const fe25519* baseValue,
+ const fe25519* valueToSubstract
+)
+{
+ uint16 ctr;
+ int64 accu = 0;
+
+ // First subtract the most significant word, so that we may
+ // reduce the result "on the fly".
+ accu = baseValue->as_uint32[7];
+ accu -= valueToSubstract->as_uint32[7];
+
+ // We always set bit #31, and compensate for this by subtracting 1 from the
+ // reduction value.
+ out->as_uint32[7] = ((uint32)accu) | 0x80000000ul;
+
+ accu = 19 * ((int32)(accu >> 31) - 1);
+ // ^ "-1" is the compensation for the "| 0x80000000ul" above.
+ // This choice makes sure that the result will be positive!
+
+ for (ctr = 0; ctr < 7; ctr += 1)
+ {
+ accu += baseValue->as_uint32[ctr];
+ accu -= valueToSubstract->as_uint32[ctr];
+
+ out->as_uint32[ctr] = (uint32)accu;
+ accu >>= 32;
+ }
+ accu += out->as_uint32[7];
+ out->as_uint32[7] = (uint32)accu;
+}
+
+static void
+fe25519_add(
+ fe25519* out,
+ const fe25519* baseValue,
+ const fe25519* valueToAdd
+)
+{
+ uint16 ctr = 0;
+ uint64 accu = 0;
+
+ // We first add the most significant word, so that we may reduce
+ // "on the fly".
+ accu = baseValue->as_uint32[7];
+ accu += valueToAdd->as_uint32[7];
+ out->as_uint32[7] = ((uint32)accu) & 0x7ffffffful;
+
+ accu = ((uint32)(accu >> 31)) * 19;
+
+ for (ctr = 0; ctr < 7; ctr += 1)
+ {
+ accu += baseValue->as_uint32[ctr];
+ accu += valueToAdd->as_uint32[ctr];
+
+ out->as_uint32[ctr] = (uint32)accu;
+ accu >>= 32;
+ }
+ accu += out->as_uint32[7];
+ out->as_uint32[7] = (uint32)accu;
+}
+
+static void
+fe25519_mul(
+ fe25519* result,
+ const fe25519* in1,
+ const fe25519* in2
+)
+{
+ UN_512bitValue tmp;
+
+ multiply256x256(&tmp, in1, in2);
+ fe25519_reduceTo256Bits_asm(result,&tmp);
+}
+
+static void
+fe25519_square(
+ fe25519* result,
+ const fe25519* in
+)
+{
+ UN_512bitValue tmp;
+
+ square256(&tmp, in);
+ fe25519_reduceTo256Bits_asm(result,&tmp);
+}
+
+static void
+fe25519_reduceCompletely(
+ fe25519* inout
+)
+{
+ uint32 numberOfTimesToSubstractPrime;
+ uint32 initialGuessForNumberOfTimesToSubstractPrime = inout->as_uint32[7] >>
+ 31;
+ uint64 accu;
+ uint8 ctr;
+
+ // Add one additional 19 to the estimated number of reductions.
+ // Do the calculation without writing back the results to memory.
+ //
+ // The initial guess of the required number of reductions is based
+ // on bit #31 of the most significant word.
+ // This initial guess may be wrong, since we might have a value
+ // v in the range
+ // 2^255 - 19 <= v < 2^255.
+ // After adding 19 to the value, we will have the correct
+ // number of required subtractions.
+ accu = initialGuessForNumberOfTimesToSubstractPrime * 19 + 19;
+
+ for (ctr = 0; ctr < 7; ctr++)
+ {
+ accu += inout->as_uint32[ctr];
+ accu >>= 32;
+ }
+ accu += inout->as_uint32[7];
+
+ numberOfTimesToSubstractPrime = (uint32)(accu >> 31);
+
+ // Do the reduction.
+ accu = numberOfTimesToSubstractPrime * 19;
+
+ for (ctr = 0; ctr < 7; ctr++)
+ {
+ accu += inout->as_uint32[ctr];
+ inout->as_uint32[ctr] = (uint32)accu;
+ accu >>= 32;
+ }
+ accu += inout->as_uint32[7];
+ inout->as_uint32[7] = accu & 0x7ffffffful;
+}
+
+/// We are already using a packed radix-16 representation for fe25519. The real use of this function
+/// is for architectures that use more bits for storing an fe25519 in a representation where multiplication
+/// may be calculated more efficiently.
+/// Here we simply copy the data.
+static void
+fe25519_pack(
+ unsigned char out[32],
+ fe25519* in
+)
+{
+ fe25519_reduceCompletely(in);
+
+ memcpy(out, in, 32);
+}
+
+// Note that r and x are allowed to overlap!
+static void
+fe25519_invert_useProvidedScratchBuffers(
+ fe25519* r,
+ const fe25519* x,
+ fe25519* t0,
+ fe25519* t1,
+ fe25519* t2
+)
+{
+ fe25519 *z11 = r; // store z11 in r (in order to save one temporary).
+ fe25519 *z2_10_0 = t1;
+ fe25519 *z2_50_0 = t2;
+ fe25519 *z2_100_0 = z2_10_0;
+
+ uint8 i;
+
+ {
+ fe25519 *z2 = z2_50_0;
+
+ /* 2 */ fe25519_square(z2, x);
+ /* 4 */ fe25519_square(t0, z2);
+ /* 8 */ fe25519_square(t0, t0);
+ /* 9 */ fe25519_mul(z2_10_0, t0, x);
+ /* 11 */ fe25519_mul(z11, z2_10_0, z2);
+
+ // z2 is dead.
+ }
+
+ /* 22 */ fe25519_square(t0, z11);
+ /* 2^5 - 2^0 = 31 */ fe25519_mul(z2_10_0, t0, z2_10_0);
+
+ /* 2^6 - 2^1 */ fe25519_square(t0, z2_10_0);
+ /* 2^7 - 2^2 */ fe25519_square(t0, t0);
+ /* 2^8 - 2^3 */ fe25519_square(t0, t0);
+ /* 2^9 - 2^4 */ fe25519_square(t0, t0);
+ /* 2^10 - 2^5 */ fe25519_square(t0, t0);
+ /* 2^10 - 2^0 */ fe25519_mul(z2_10_0, t0, z2_10_0);
+
+ /* 2^11 - 2^1 */ fe25519_square(t0, z2_10_0);
+
+ /* 2^20 - 2^10 */ for (i = 1; i < 10; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^20 - 2^0 */ fe25519_mul(z2_50_0, t0, z2_10_0);
+
+ /* 2^21 - 2^1 */ fe25519_square(t0, z2_50_0);
+
+ /* 2^40 - 2^20 */ for (i = 1; i < 20; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^40 - 2^0 */ fe25519_mul(t0, t0, z2_50_0);
+
+ /* 2^41 - 2^1 */ fe25519_square(t0, t0);
+
+ /* 2^50 - 2^10 */ for (i = 1; i < 10; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^50 - 2^0 */ fe25519_mul(z2_50_0, t0, z2_10_0);
+
+ /* 2^51 - 2^1 */ fe25519_square(t0, z2_50_0);
+
+ /* 2^100 - 2^50 */ for (i = 1; i < 50; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^100 - 2^0 */ fe25519_mul(z2_100_0, t0, z2_50_0);
+
+ /* 2^101 - 2^1 */ fe25519_square(t0, z2_100_0);
+
+ /* 2^200 - 2^100 */ for (i = 1; i < 100; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^200 - 2^0 */ fe25519_mul(t0, t0, z2_100_0);
+
+ /* 2^250 - 2^50 */ for (i = 0; i < 50; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^250 - 2^0 */ fe25519_mul(t0, t0, z2_50_0);
+
+ /* 2^255 - 2^5 */ for (i = 0; i < 5; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^255 - 21 */ fe25519_mul(r, t0, z11);
+}
+
+static void
+fe25519_setzero(
+ fe25519* out
+)
+{
+ uint8 ctr;
+
+ for (ctr = 0; ctr < 8; ctr++)
+ {
+ out->as_uint32[ctr] = 0;
+ }
+}
+
+static void
+fe25519_setone(
+ fe25519* out
+)
+{
+ uint8 ctr;
+
+ out->as_uint32[0] = 1;
+
+ for (ctr = 1; ctr < 8; ctr++)
+ {
+ out->as_uint32[ctr] = 0;
+ }
+}
+
+static void
+fe25519_cswap(
+ fe25519* in1,
+ fe25519* in2,
+ int condition
+)
+{
+ int32 mask = condition;
+ uint32 ctr;
+
+ mask = -mask;
+
+ for (ctr = 0; ctr < 8; ctr++)
+ {
+ uint32 val1 = in1->as_uint32[ctr];
+ uint32 val2 = in2->as_uint32[ctr];
+ uint32 temp = val1;
+
+ val1 ^= mask & (val2 ^ val1);
+ val2 ^= mask & (val2 ^ temp);
+
+
+ in1->as_uint32[ctr] = val1;
+ in2->as_uint32[ctr] = val2;
+ }
+}
+
+// ****************************************************
+// Scalarmultiplication implementation.
+// ****************************************************
+
+typedef struct _ST_curve25519ladderstepWorkingState
+{
+ // The base point in affine coordinates
+ fe25519 x0;
+
+ // The two working points p, q, in projective coordinates. Possibly randomized.
+ fe25519 xp;
+ fe25519 zp;
+ fe25519 xq;
+ fe25519 zq;
+
+ UN_256bitValue s;
+
+ int nextScalarBitToProcess;
+ uint8 previousProcessedBit;
+} ST_curve25519ladderstepWorkingState;
+
+static void
+curve25519_ladderstep(
+ ST_curve25519ladderstepWorkingState* pState
+)
+{
+ // Implements the "ladd-1987-m-3" differential-addition-and-doubling formulas
+ // Source: 1987 Montgomery "Speeding the Pollard and elliptic curve methods of factorization", page 261,
+ // fifth and sixth displays, plus common-subexpression elimination.
+ //
+ // Notation from the explicit formulas database:
+ // (X2,Z2) corresponds to (xp,zp),
+ // (X3,Z3) corresponds to (xq,zq)
+ // Result (X4,Z4) (X5,Z5) expected in (xp,zp) and (xq,zq)
+ //
+ // A = X2+Z2; AA = A^2; B = X2-Z2; BB = B^2; E = AA-BB; C = X3+Z3; D = X3-Z3;
+ // DA = D*A; CB = C*B; t0 = DA+CB; t1 = t0^2; X5 = Z1*t1; t2 = DA-CB;
+ // t3 = t2^2; Z5 = X1*t3; X4 = AA*BB; t4 = a24*E; t5 = BB+t4; Z4 = E*t5 ;
+ //
+ // Re-ordered to use fewer temporaries.
+
+ fe25519 t1, t2;
+
+ fe25519 *b1=&pState->xp; fe25519 *b2=&pState->zp;
+ fe25519 *b3=&pState->xq; fe25519 *b4=&pState->zq;
+
+ fe25519 *b5= &t1; fe25519 *b6=&t2;
+
+ fe25519_add(b5,b1,b2); // A = X2+Z2
+ fe25519_sub(b6,b1,b2); // B = X2-Z2
+ fe25519_add(b1,b3,b4); // C = X3+Z3
+ fe25519_sub(b2,b3,b4); // D = X3-Z3
+ fe25519_mul(b3,b2,b5); // DA= D*A
+ fe25519_mul(b2,b1,b6); // CB= C*B
+ fe25519_add(b1,b2,b3); // T0= DA+CB
+ fe25519_sub(b4,b3,b2); // T2= DA-CB
+ fe25519_square(b3,b1); // X5==T1= T0^2
+ fe25519_square(b1,b4); // T3= t2^2
+ fe25519_mul(b4,b1,&pState->x0); // Z5=X1*t3
+ fe25519_square(b1,b5); // AA=A^2
+ fe25519_square(b5,b6); // BB=B^2
+ fe25519_sub(b2,b1,b5); // E=AA-BB
+ fe25519_mul(b1,b5,b1); // X4= AA*BB
+ fe25519_mpyWith121666 (b6,b2); // T4 = a24*E
+ fe25519_add(b6,b6,b5); // T5 = BB + t4
+ fe25519_mul(b2,b6,b2); // Z4 = E*t5
+}
+
+static void
+curve25519_cswap(
+ ST_curve25519ladderstepWorkingState* state,
+ uint8 b
+)
+{
+ fe25519_cswap (&state->xp, &state->xq,b);
+ fe25519_cswap (&state->zp, &state->zq,b);
+}
+
+void
+x25519_scalar_mult(
+ uint8_t r[32],
+ const uint8_t s[32],
+ const uint8_t p[32]
+)
+{
+ ST_curve25519ladderstepWorkingState state;
+ unsigned char i;
+
+
+ // Prepare the scalar within the working state buffer.
+ for (i = 0; i < 32; i++)
+ {
+ state.s.as_uint8 [i] = s[i];
+ }
+ state.s.as_uint8 [0] &= 248;
+ state.s.as_uint8 [31] &= 127;
+ state.s.as_uint8 [31] |= 64;
+
+ // Copy the affine x-axis of the base point to the state.
+ fe25519_unpack (&state.x0, p);
+
+ // Prepare the working points within the working state struct.
+
+ fe25519_setone (&state.zq);
+ fe25519_cpy (&state.xq, &state.x0);
+
+ fe25519_setone(&state.xp);
+ fe25519_setzero(&state.zp);
+
+ state.nextScalarBitToProcess = 254;
+
+ state.previousProcessedBit = 0;
+
+ // Process all the scalar bits, from bit 254 down to bit 0.
+ while (state.nextScalarBitToProcess >= 0)
+ {
+ uint8 byteNo = state.nextScalarBitToProcess >> 3;
+ uint8 bitNo = state.nextScalarBitToProcess & 7;
+ uint8 bit;
+ uint8 swap;
+
+ bit = 1 & (state.s.as_uint8 [byteNo] >> bitNo);
+ swap = bit ^ state.previousProcessedBit;
+ state.previousProcessedBit = bit;
+ curve25519_cswap(&state, swap);
+ curve25519_ladderstep(&state);
+ state.nextScalarBitToProcess --;
+ }
+
+ curve25519_cswap(&state,state.previousProcessedBit);
+
+ // optimize for stack usage.
+ fe25519_invert_useProvidedScratchBuffers (&state.zp, &state.zp, &state.xq, &state.zq, &state.x0);
+ fe25519_mul(&state.xp, &state.xp, &state.zp);
+ fe25519_reduceCompletely(&state.xp);
+
+ fe25519_pack (r, &state.xp);
+}
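scalarmult.c exposes x25519_scalar_mult(), which clamps the scalar internally
and runs a constant-time Montgomery ladder over the x-coordinate. A
hypothetical caller (the wrapper name below is illustrative, not from this
commit) derives an X25519 public key by multiplying the standard base point
u = 9:

#include <stdint.h>

/* Declared here for the sketch; defined in scalarmult.c above. */
extern void x25519_scalar_mult(uint8_t r[32], const uint8_t s[32],
                               const uint8_t p[32]);

void x25519_public_from_private(uint8_t pub[32], const uint8_t priv[32])
{
    /* RFC 7748 base point: u = 9, encoded little endian. */
    static const uint8_t base_point[32] = { 9 };

    x25519_scalar_mult(pub, priv, base_point);
}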
diff --git a/third_party/unacl-curve25519/core/cortex-m0/curve25519/sqr.S b/third_party/unacl-curve25519/core/cortex-m0/curve25519/sqr.S
new file mode 100644
index 0000000000..b62121adb7
--- /dev/null
+++ b/third_party/unacl-curve25519/core/cortex-m0/curve25519/sqr.S
@@ -0,0 +1,1164 @@
+// Author: Ana Helena Sánchez, Björn Haase (second implementation).
+//
+// public domain
+//
+
+ .align 2
+ .global square256_asm
+ .type square256_asm, %function
+square256_asm:
+// ######################
+// ASM Square 256 refined karatsuba:
+// ######################
+ // sqr 256 Refined Karatsuba
+ // pInput in r1
+ // pResult in r0
+ // adheres to arm eabi calling convention.
+ push {r1,r4,r5,r6,r7,r14}
+ .syntax unified
+ mov r3,r8
+ mov r4,r9
+ mov r5,r10
+ mov r6,r11
+ mov r7,r12
+ .syntax divided
+ push {r3,r4,r5,r6,r7}
+ .syntax unified
+ mov r14,r0
+ .syntax divided
+ ldm r1!,{r4,r5,r6,r7}
+ // sqr 128 Refined Karatsuba
+ // Input in r4 ... r7
+ // Result in r0 ... r7
+ // clobbers all registers except for r14
+ .syntax unified
+ mov r0,r4
+ mov r1,r5
+ .syntax divided
+ sub r0,r6
+ sbc r1,r7
+ sbc r2,r2
+ eor r0,r2
+ eor r1,r2
+ sub r0,r2
+ sbc r1,r2
+ .syntax unified
+ mov r8,r0
+ mov r9,r1
+ mov r10,r6
+ .syntax divided
+ // START: sqr 64 Refined Karatsuba
+ // Input operands in r4,r5
+ // Result in r0,r1,r2,r3
+ // Clobbers: r4-r6
+ // START: sqr 32
+ // Input operand in r4
+ // Result in r0 ,r1
+ // Clobbers: r2, r3
+ uxth r0,r4
+ lsr r1,r4,#16
+ .syntax unified
+ mov r2,r0
+ .syntax divided
+ mul r2,r1
+ mul r0,r0
+ mul r1,r1
+ lsr r3,r2,#15
+ lsl r2,r2,#17
+ add r0,r2
+ adc r1,r3
+ // End: sqr 32
+ // Result in r0 ,r1
+ sub r4,r5
+ sbc r6,r6
+ eor r4,r6
+ sub r4,r6
+ // START: sqr 32
+ // Input operand in r5
+ // Result in r2 ,r3
+ // Clobbers: r5, r6
+ uxth r2,r5
+ lsr r3,r5,#16
+ .syntax unified
+ mov r5,r2
+ .syntax divided
+ mul r5,r3
+ mul r2,r2
+ mul r3,r3
+ lsr r6,r5,#15
+ lsl r5,r5,#17
+ add r2,r5
+ adc r3,r6
+ // End: sqr 32
+ // Result in r2 ,r3
+ mov r6,#0
+ add r2,r1
+ adc r3,r6
+ // START: sqr 32
+ // Input operand in r4
+ // Result in r4 ,r5
+ // Clobbers: r1, r6
+ lsr r5,r4,#16
+ uxth r4,r4
+ .syntax unified
+ mov r1,r4
+ .syntax divided
+ mul r1,r5
+ mul r4,r4
+ mul r5,r5
+ lsr r6,r1,#15
+ lsl r1,r1,#17
+ add r4,r1
+ adc r5,r6
+ // End: sqr 32
+ // Result in r4 ,r5
+ .syntax unified
+ mov r1,r2
+ .syntax divided
+ sub r1,r4
+ sbc r2,r5
+ .syntax unified
+ mov r5,r3
+ .syntax divided
+ mov r6,#0
+ sbc r3,r6
+ add r1,r0
+ adc r2,r5
+ adc r3,r6
+ // END: sqr 64 Refined Karatsuba
+ // Result in r0,r1,r2,r3
+ // Leaves r6 zero.
+ .syntax unified
+ mov r6,r10
+ mov r10,r0
+ mov r11,r1
+ mov r12,r2
+ mov r1,r3
+ .syntax divided
+ // START: sqr 64 Refined Karatsuba
+ // Input operands in r6,r7
+ // Result in r2,r3,r4,r5
+ // Clobbers: r0,r7,r6
+ // START: sqr 32
+ // Input operand in r6
+ // Result in r2 ,r3
+ // Clobbers: r4, r5
+ uxth r2,r6
+ lsr r3,r6,#16
+ .syntax unified
+ mov r4,r2
+ .syntax divided
+ mul r4,r3
+ mul r2,r2
+ mul r3,r3
+ lsr r5,r4,#15
+ lsl r4,r4,#17
+ add r2,r4
+ adc r3,r5
+ // End: sqr 32
+ // Result in r2 ,r3
+ sub r6,r7
+ sbc r4,r4
+ eor r6,r4
+ sub r6,r4
+ // START: sqr 32
+ // Input operand in r7
+ // Result in r4 ,r5
+ // Clobbers: r0, r7
+ uxth r4,r7
+ lsr r5,r7,#16
+ .syntax unified
+ mov r0,r4
+ .syntax divided
+ mul r0,r5
+ mul r4,r4
+ mul r5,r5
+ lsr r7,r0,#15
+ lsl r0,r0,#17
+ add r4,r0
+ adc r5,r7
+ // End: sqr 32
+ // Result in r4 ,r5
+ mov r7,#0
+ add r4,r3
+ adc r5,r7
+ // START: sqr 32
+ // Input operand in r6
+ // Result in r7 ,r0
+ // Clobbers: r6, r3
+ uxth r7,r6
+ lsr r0,r6,#16
+ .syntax unified
+ mov r6,r7
+ .syntax divided
+ mul r6,r0
+ mul r7,r7
+ mul r0,r0
+ lsr r3,r6,#15
+ lsl r6,r6,#17
+ add r7,r6
+ adc r0,r3
+ // End: sqr 32
+ // Result in r7 ,r0
+ .syntax unified
+ mov r3,r4
+ .syntax divided
+ sub r3,r7
+ sbc r4,r0
+ .syntax unified
+ mov r0,r5
+ .syntax divided
+ mov r6,#0
+ sbc r5,r6
+ add r3,r2
+ adc r4,r0
+ adc r5,r6
+ // END: sqr 64 Refined Karatsuba
+ // Result in r2,r3,r4,r5
+ // Leaves r6 zero.
+ .syntax unified
+ mov r0,r12
+ .syntax divided
+ add r2,r0
+ adc r3,r1
+ adc r4,r6
+ adc r5,r6
+ .syntax unified
+ mov r12,r2
+ mov r2,r8
+ mov r8,r3
+ mov r3,r9
+ mov r9,r4
+ .syntax divided
+ // START: sqr 64 Refined Karatsuba
+ // Input operands in r2,r3
+ // Result in r6,r7,r0,r1
+ // Clobbers: r2,r3,r4
+ // START: sqr 32
+ // Input operand in r2
+ // Result in r6 ,r7
+ // Clobbers: r0, r1
+ uxth r6,r2
+ lsr r7,r2,#16
+ .syntax unified
+ mov r0,r6
+ .syntax divided
+ mul r0,r7
+ mul r6,r6
+ mul r7,r7
+ lsr r1,r0,#15
+ lsl r0,r0,#17
+ add r6,r0
+ adc r7,r1
+ // End: sqr 32
+ // Result in r6 ,r7
+ sub r2,r3
+ sbc r4,r4
+ eor r2,r4
+ sub r2,r4
+ // START: sqr 32
+ // Input operand in r3
+ // Result in r0 ,r1
+ // Clobbers: r3, r4
+ uxth r0,r3
+ lsr r1,r3,#16
+ .syntax unified
+ mov r3,r0
+ .syntax divided
+ mul r3,r1
+ mul r0,r0
+ mul r1,r1
+ lsr r4,r3,#15
+ lsl r3,r3,#17
+ add r0,r3
+ adc r1,r4
+ // End: sqr 32
+ // Result in r0 ,r1
+ mov r4,#0
+ add r0,r7
+ adc r1,r4
+ // START: sqr 32
+ // Input operand in r2
+ // Result in r3 ,r4
+ // Clobbers: r2, r7
+ uxth r3,r2
+ lsr r4,r2,#16
+ .syntax unified
+ mov r2,r3
+ .syntax divided
+ mul r2,r4
+ mul r3,r3
+ mul r4,r4
+ lsr r7,r2,#15
+ lsl r2,r2,#17
+ add r3,r2
+ adc r4,r7
+ // End: sqr 32
+ // Result in r3 ,r4
+ .syntax unified
+ mov r7,r0
+ .syntax divided
+ sub r7,r3
+ sbc r0,r4
+ .syntax unified
+ mov r2,r1
+ .syntax divided
+ mov r4,#0
+ sbc r1,r4
+ add r7,r6
+ adc r0,r2
+ adc r1,r4
+ // END: sqr 64 Refined Karatsuba
+ // Result in r6,r7,r0,r1
+ // Returns r4 as zero.
+ .syntax unified
+ mov r2,r12
+ mov r3,r8
+ mov r4,r9
+ .syntax divided
+ sub r2,r6
+ sbc r3,r7
+ .syntax unified
+ mov r6,r4
+ mov r7,r5
+ .syntax divided
+ sbc r4,r0
+ sbc r5,r1
+ mov r0,#0
+ sbc r6,r0
+ sbc r7,r0
+ .syntax unified
+ mov r0,r10
+ .syntax divided
+ add r2,r0
+ .syntax unified
+ mov r1,r11
+ .syntax divided
+ adc r3,r1
+ .syntax unified
+ mov r0,r12
+ .syntax divided
+ adc r4,r0
+ .syntax unified
+ mov r0,r8
+ .syntax divided
+ adc r5,r0
+ mov r0,#0
+ adc r6,r0
+ adc r7,r0
+ .syntax unified
+ mov r0,r10
+ .syntax divided
+ // END: sqr 128 Refined Karatsuba
+ // Result in r0 ... r7
+ push {r4,r5,r6,r7}
+ .syntax unified
+ mov r4,r14
+ .syntax divided
+ stm r4!,{r0,r1,r2,r3}
+ ldr r4,[SP,#36]
+ add r4,#16
+ ldm r4,{r4,r5,r6,r7}
+ // sqr 128 Refined Karatsuba
+ // Input in r4 ... r7
+ // Result in r0 ... r7
+ // clobbers all registers except for r14
+ .syntax unified
+ mov r0,r4
+ mov r1,r5
+ .syntax divided
+ sub r0,r6
+ sbc r1,r7
+ sbc r2,r2
+ eor r0,r2
+ eor r1,r2
+ sub r0,r2
+ sbc r1,r2
+ .syntax unified
+ mov r8,r0
+ mov r9,r1
+ mov r10,r6
+ .syntax divided
+ // START: sqr 64 Refined Karatsuba
+ // Input operands in r4,r5
+ // Result in r0,r1,r2,r3
+ // Clobbers: r4-r6
+ // START: sqr 32
+ // Input operand in r4
+ // Result in r0 ,r1
+ // Clobbers: r2, r3
+ uxth r0,r4
+ lsr r1,r4,#16
+ .syntax unified
+ mov r2,r0
+ .syntax divided
+ mul r2,r1
+ mul r0,r0
+ mul r1,r1
+ lsr r3,r2,#15
+ lsl r2,r2,#17
+ add r0,r2
+ adc r1,r3
+ // End: sqr 32
+ // Result in r0 ,r1
+ sub r4,r5
+ sbc r6,r6
+ eor r4,r6
+ sub r4,r6
+ // START: sqr 32
+ // Input operand in r5
+ // Result in r2 ,r3
+ // Clobbers: r5, r6
+ uxth r2,r5
+ lsr r3,r5,#16
+ .syntax unified
+ mov r5,r2
+ .syntax divided
+ mul r5,r3
+ mul r2,r2
+ mul r3,r3
+ lsr r6,r5,#15
+ lsl r5,r5,#17
+ add r2,r5
+ adc r3,r6
+ // End: sqr 32
+ // Result in r2 ,r3
+ mov r6,#0
+ add r2,r1
+ adc r3,r6
+ // START: sqr 32
+ // Input operand in r4
+ // Result in r4 ,r5
+ // Clobbers: r1, r6
+ lsr r5,r4,#16
+ uxth r4,r4
+ .syntax unified
+ mov r1,r4
+ .syntax divided
+ mul r1,r5
+ mul r4,r4
+ mul r5,r5
+ lsr r6,r1,#15
+ lsl r1,r1,#17
+ add r4,r1
+ adc r5,r6
+ // End: sqr 32
+ // Result in r4 ,r5
+ .syntax unified
+ mov r1,r2
+ .syntax divided
+ sub r1,r4
+ sbc r2,r5
+ .syntax unified
+ mov r5,r3
+ .syntax divided
+ mov r6,#0
+ sbc r3,r6
+ add r1,r0
+ adc r2,r5
+ adc r3,r6
+ // END: sqr 64 Refined Karatsuba
+ // Result in r0,r1,r2,r3
+ // Leaves r6 zero.
+ .syntax unified
+ mov r6,r10
+ mov r10,r0
+ mov r11,r1
+ mov r12,r2
+ mov r1,r3
+ .syntax divided
+ // START: sqr 64 Refined Karatsuba
+ // Input operands in r6,r7
+ // Result in r2,r3,r4,r5
+ // Clobbers: r0,r7,r6
+ // START: sqr 32
+ // Input operand in r6
+ // Result in r2 ,r3
+ // Clobbers: r4, r5
+ uxth r2,r6
+ lsr r3,r6,#16
+ .syntax unified
+ mov r4,r2
+ .syntax divided
+ mul r4,r3
+ mul r2,r2
+ mul r3,r3
+ lsr r5,r4,#15
+ lsl r4,r4,#17
+ add r2,r4
+ adc r3,r5
+ // End: sqr 32
+ // Result in r2 ,r3
+ sub r6,r7
+ sbc r4,r4
+ eor r6,r4
+ sub r6,r4
+ // START: sqr 32
+ // Input operand in r7
+ // Result in r4 ,r5
+ // Clobbers: r0, r7
+ uxth r4,r7
+ lsr r5,r7,#16
+ .syntax unified
+ mov r0,r4
+ .syntax divided
+ mul r0,r5
+ mul r4,r4
+ mul r5,r5
+ lsr r7,r0,#15
+ lsl r0,r0,#17
+ add r4,r0
+ adc r5,r7
+ // End: sqr 32
+ // Result in r4 ,r5
+ mov r7,#0
+ add r4,r3
+ adc r5,r7
+ // START: sqr 32
+ // Input operand in r6
+ // Result in r7 ,r0
+ // Clobbers: r6, r3
+ uxth r7,r6
+ lsr r0,r6,#16
+ .syntax unified
+ mov r6,r7
+ .syntax divided
+ mul r6,r0
+ mul r7,r7
+ mul r0,r0
+ lsr r3,r6,#15
+ lsl r6,r6,#17
+ add r7,r6
+ adc r0,r3
+ // End: sqr 32
+ // Result in r7 ,r0
+ .syntax unified
+ mov r3,r4
+ .syntax divided
+ sub r3,r7
+ sbc r4,r0
+ .syntax unified
+ mov r0,r5
+ .syntax divided
+ mov r6,#0
+ sbc r5,r6
+ add r3,r2
+ adc r4,r0
+ adc r5,r6
+ // END: sqr 64 Refined Karatsuba
+ // Result in r2,r3,r4,r5
+ // Leaves r6 zero.
+ .syntax unified
+ mov r0,r12
+ .syntax divided
+ add r2,r0
+ adc r3,r1
+ adc r4,r6
+ adc r5,r6
+ .syntax unified
+ mov r12,r2
+ mov r2,r8
+ mov r8,r3
+ mov r3,r9
+ mov r9,r4
+ .syntax divided
+ // START: sqr 64 Refined Karatsuba
+ // Input operands in r2,r3
+ // Result in r6,r7,r0,r1
+ // Clobbers: r2,r3,r4
+ // START: sqr 32
+ // Input operand in r2
+ // Result in r6 ,r7
+ // Clobbers: r0, r1
+ uxth r6,r2
+ lsr r7,r2,#16
+ .syntax unified
+ mov r0,r6
+ .syntax divided
+ mul r0,r7
+ mul r6,r6
+ mul r7,r7
+ lsr r1,r0,#15
+ lsl r0,r0,#17
+ add r6,r0
+ adc r7,r1
+ // End: sqr 32
+ // Result in r6 ,r7
+ sub r2,r3
+ sbc r4,r4
+ eor r2,r4
+ sub r2,r4
+ // START: sqr 32
+ // Input operand in r3
+ // Result in r0 ,r1
+ // Clobbers: r3, r4
+ uxth r0,r3
+ lsr r1,r3,#16
+ .syntax unified
+ mov r3,r0
+ .syntax divided
+ mul r3,r1
+ mul r0,r0
+ mul r1,r1
+ lsr r4,r3,#15
+ lsl r3,r3,#17
+ add r0,r3
+ adc r1,r4
+ // End: sqr 32
+ // Result in r0 ,r1
+ mov r4,#0
+ add r0,r7
+ adc r1,r4
+ // START: sqr 32
+ // Input operand in r2
+ // Result in r3 ,r4
+ // Clobbers: r2, r7
+ uxth r3,r2
+ lsr r4,r2,#16
+ .syntax unified
+ mov r2,r3
+ .syntax divided
+ mul r2,r4
+ mul r3,r3
+ mul r4,r4
+ lsr r7,r2,#15
+ lsl r2,r2,#17
+ add r3,r2
+ adc r4,r7
+ // End: sqr 32
+ // Result in r3 ,r4
+ .syntax unified
+ mov r7,r0
+ .syntax divided
+ sub r7,r3
+ sbc r0,r4
+ .syntax unified
+ mov r2,r1
+ .syntax divided
+ mov r4,#0
+ sbc r1,r4
+ add r7,r6
+ adc r0,r2
+ adc r1,r4
+ // END: sqr 64 Refined Karatsuba
+ // Result in r6,r7,r0,r1
+ // Returns r4 as zero.
+ .syntax unified
+ mov r2,r12
+ mov r3,r8
+ mov r4,r9
+ .syntax divided
+ sub r2,r6
+ sbc r3,r7
+ .syntax unified
+ mov r6,r4
+ mov r7,r5
+ .syntax divided
+ sbc r4,r0
+ sbc r5,r1
+ mov r0,#0
+ sbc r6,r0
+ sbc r7,r0
+ .syntax unified
+ mov r0,r10
+ .syntax divided
+ add r2,r0
+ .syntax unified
+ mov r1,r11
+ .syntax divided
+ adc r3,r1
+ .syntax unified
+ mov r0,r12
+ .syntax divided
+ adc r4,r0
+ .syntax unified
+ mov r0,r8
+ .syntax divided
+ adc r5,r0
+ mov r0,#0
+ adc r6,r0
+ adc r7,r0
+ .syntax unified
+ mov r0,r10
+ .syntax divided
+ // END: sqr 128 Refined Karatsuba
+ // Result in r0 ... r7
+ .syntax unified
+ mov r8,r4
+ mov r9,r5
+ mov r10,r6
+ mov r11,r7
+ .syntax divided
+ pop {r4,r5,r6,r7}
+ add r0,r4
+ adc r1,r5
+ adc r2,r6
+ adc r3,r7
+ .syntax unified
+ mov r4,r8
+ mov r5,r9
+ mov r6,r10
+ mov r7,r11
+ mov r8,r0
+ .syntax divided
+ mov r0,#0
+ adc r4,r0
+ adc r5,r0
+ adc r6,r0
+ adc r7,r0
+ .syntax unified
+ mov r0,r8
+ .syntax divided
+ push {r0,r1,r2,r3,r4,r5,r6,r7}
+ ldr r4,[SP,#52]
+ ldm r4,{r0,r1,r2,r3,r4,r5,r6,r7}
+ sub r4,r0
+ sbc r5,r1
+ sbc r6,r2
+ sbc r7,r3
+ sbc r0,r0
+ eor r4,r0
+ eor r5,r0
+ eor r6,r0
+ eor r7,r0
+ sub r4,r0
+ sbc r5,r0
+ sbc r6,r0
+ sbc r7,r0
+ // sqr 128 Refined Karatsuba
+ // Input in r4 ... r7
+ // Result in r0 ... r7
+ // clobbers all registers except for r14
+ .syntax unified
+ mov r0,r4
+ mov r1,r5
+ .syntax divided
+ sub r0,r6
+ sbc r1,r7
+ sbc r2,r2
+ eor r0,r2
+ eor r1,r2
+ sub r0,r2
+ sbc r1,r2
+ .syntax unified
+ mov r8,r0
+ mov r9,r1
+ mov r10,r6
+ .syntax divided
+ // START: sqr 64 Refined Karatsuba
+ // Input operands in r4,r5
+ // Result in r0,r1,r2,r3
+ // Clobbers: r4-r6
+ // START: sqr 32
+ // Input operand in r4
+ // Result in r0 ,r1
+ // Clobbers: r2, r3
+ uxth r0,r4
+ lsr r1,r4,#16
+ .syntax unified
+ mov r2,r0
+ .syntax divided
+ mul r2,r1
+ mul r0,r0
+ mul r1,r1
+ lsr r3,r2,#15
+ lsl r2,r2,#17
+ add r0,r2
+ adc r1,r3
+ // End: sqr 32
+ // Result in r0 ,r1
+ sub r4,r5
+ sbc r6,r6
+ eor r4,r6
+ sub r4,r6
+ // START: sqr 32
+ // Input operand in r5
+ // Result in r2 ,r3
+ // Clobbers: r5, r6
+ uxth r2,r5
+ lsr r3,r5,#16
+ .syntax unified
+ mov r5,r2
+ .syntax divided
+ mul r5,r3
+ mul r2,r2
+ mul r3,r3
+ lsr r6,r5,#15
+ lsl r5,r5,#17
+ add r2,r5
+ adc r3,r6
+ // End: sqr 32
+ // Result in r2 ,r3
+ mov r6,#0
+ add r2,r1
+ adc r3,r6
+ // START: sqr 32
+ // Input operand in r4
+ // Result in r4 ,r5
+ // Clobbers: r1, r6
+ lsr r5,r4,#16
+ uxth r4,r4
+ .syntax unified
+ mov r1,r4
+ .syntax divided
+ mul r1,r5
+ mul r4,r4
+ mul r5,r5
+ lsr r6,r1,#15
+ lsl r1,r1,#17
+ add r4,r1
+ adc r5,r6
+ // End: sqr 32
+ // Result in r4 ,r5
+ .syntax unified
+ mov r1,r2
+ .syntax divided
+ sub r1,r4
+ sbc r2,r5
+ .syntax unified
+ mov r5,r3
+ .syntax divided
+ mov r6,#0
+ sbc r3,r6
+ add r1,r0
+ adc r2,r5
+ adc r3,r6
+ // END: sqr 64 Refined Karatsuba
+ // Result in r0,r1,r2,r3
+ // Leaves r6 zero.
+ .syntax unified
+ mov r6,r10
+ mov r10,r0
+ mov r11,r1
+ mov r12,r2
+ mov r1,r3
+ .syntax divided
+ // START: sqr 64 Refined Karatsuba
+ // Input operands in r6,r7
+ // Result in r2,r3,r4,r5
+ // Clobbers: r0,r7,r6
+ // START: sqr 32
+ // Input operand in r6
+ // Result in r2 ,r3
+ // Clobbers: r4, r5
+ uxth r2,r6
+ lsr r3,r6,#16
+ .syntax unified
+ mov r4,r2
+ .syntax divided
+ mul r4,r3
+ mul r2,r2
+ mul r3,r3
+ lsr r5,r4,#15
+ lsl r4,r4,#17
+ add r2,r4
+ adc r3,r5
+ // End: sqr 32
+ // Result in r2 ,r3
+ sub r6,r7
+ sbc r4,r4
+ eor r6,r4
+ sub r6,r4
+ // START: sqr 32
+ // Input operand in r7
+ // Result in r4 ,r5
+ // Clobbers: r0, r7
+ uxth r4,r7
+ lsr r5,r7,#16
+ .syntax unified
+ mov r0,r4
+ .syntax divided
+ mul r0,r5
+ mul r4,r4
+ mul r5,r5
+ lsr r7,r0,#15
+ lsl r0,r0,#17
+ add r4,r0
+ adc r5,r7
+ // End: sqr 32
+ // Result in r4 ,r5
+ mov r7,#0
+ add r4,r3
+ adc r5,r7
+ // START: sqr 32
+ // Input operand in r6
+ // Result in r7 ,r0
+ // Clobbers: r6, r3
+ uxth r7,r6
+ lsr r0,r6,#16
+ .syntax unified
+ mov r6,r7
+ .syntax divided
+ mul r6,r0
+ mul r7,r7
+ mul r0,r0
+ lsr r3,r6,#15
+ lsl r6,r6,#17
+ add r7,r6
+ adc r0,r3
+ // End: sqr 32
+ // Result in r7 ,r0
+ .syntax unified
+ mov r3,r4
+ .syntax divided
+ sub r3,r7
+ sbc r4,r0
+ .syntax unified
+ mov r0,r5
+ .syntax divided
+ mov r6,#0
+ sbc r5,r6
+ add r3,r2
+ adc r4,r0
+ adc r5,r6
+ // END: sqr 64 Refined Karatsuba
+ // Result in r2,r3,r4,r5
+ // Leaves r6 zero.
+ .syntax unified
+ mov r0,r12
+ .syntax divided
+ add r2,r0
+ adc r3,r1
+ adc r4,r6
+ adc r5,r6
+ .syntax unified
+ mov r12,r2
+ mov r2,r8
+ mov r8,r3
+ mov r3,r9
+ mov r9,r4
+ .syntax divided
+ // START: sqr 64 Refined Karatsuba
+ // Input operands in r2,r3
+ // Result in r6,r7,r0,r1
+ // Clobbers: r2,r3,r4
+ // START: sqr 32
+ // Input operand in r2
+ // Result in r6 ,r7
+ // Clobbers: r0, r1
+ uxth r6,r2
+ lsr r7,r2,#16
+ .syntax unified
+ mov r0,r6
+ .syntax divided
+ mul r0,r7
+ mul r6,r6
+ mul r7,r7
+ lsr r1,r0,#15
+ lsl r0,r0,#17
+ add r6,r0
+ adc r7,r1
+ // End: sqr 32
+ // Result in r6 ,r7
+ sub r2,r3
+ sbc r4,r4
+ eor r2,r4
+ sub r2,r4
+ // START: sqr 32
+ // Input operand in r3
+ // Result in r0 ,r1
+ // Clobbers: r3, r4
+ uxth r0,r3
+ lsr r1,r3,#16
+ .syntax unified
+ mov r3,r0
+ .syntax divided
+ mul r3,r1
+ mul r0,r0
+ mul r1,r1
+ lsr r4,r3,#15
+ lsl r3,r3,#17
+ add r0,r3
+ adc r1,r4
+ // End: sqr 32
+ // Result in r0 ,r1
+ mov r4,#0
+ add r0,r7
+ adc r1,r4
+ // START: sqr 32
+ // Input operand in r2
+ // Result in r3 ,r4
+ // Clobbers: r2, r7
+ uxth r3,r2
+ lsr r4,r2,#16
+ .syntax unified
+ mov r2,r3
+ .syntax divided
+ mul r2,r4
+ mul r3,r3
+ mul r4,r4
+ lsr r7,r2,#15
+ lsl r2,r2,#17
+ add r3,r2
+ adc r4,r7
+ // End: sqr 32
+ // Result in r3 ,r4
+ .syntax unified
+ mov r7,r0
+ .syntax divided
+ sub r7,r3
+ sbc r0,r4
+ .syntax unified
+ mov r2,r1
+ .syntax divided
+ mov r4,#0
+ sbc r1,r4
+ add r7,r6
+ adc r0,r2
+ adc r1,r4
+ // END: sqr 64 Refined Karatsuba
+ // Result in r6,r7,r0,r1
+ // Returns r4 as zero.
+ .syntax unified
+ mov r2,r12
+ mov r3,r8
+ mov r4,r9
+ .syntax divided
+ sub r2,r6
+ sbc r3,r7
+ .syntax unified
+ mov r6,r4
+ mov r7,r5
+ .syntax divided
+ sbc r4,r0
+ sbc r5,r1
+ mov r0,#0
+ sbc r6,r0
+ sbc r7,r0
+ .syntax unified
+ mov r0,r10
+ .syntax divided
+ add r2,r0
+ .syntax unified
+ mov r1,r11
+ .syntax divided
+ adc r3,r1
+ .syntax unified
+ mov r0,r12
+ .syntax divided
+ adc r4,r0
+ .syntax unified
+ mov r0,r8
+ .syntax divided
+ adc r5,r0
+ mov r0,#0
+ adc r6,r0
+ adc r7,r0
+ .syntax unified
+ mov r0,r10
+ .syntax divided
+ // END: sqr 128 Refined Karatsuba
+ // Result in r0 ... r7
+ mvn r0,r0
+ mvn r1,r1
+ mvn r2,r2
+ mvn r3,r3
+ mvn r4,r4
+ mvn r5,r5
+ mvn r6,r6
+ mvn r7,r7
+ .syntax unified
+ mov r8,r4
+ mov r9,r5
+ mov r10,r6
+ mov r11,r7
+ .syntax divided
+ mov r4,#143
+ asr r4,r4,#1
+ pop {r4,r5,r6,r7}
+ adc r0,r4
+ adc r1,r5
+ adc r2,r6
+ adc r3,r7
+ .syntax unified
+ mov r12,r4
+ .syntax divided
+ mov r4,#16
+ add r4,r14
+ stm r4!,{r0,r1,r2,r3}
+ .syntax unified
+ mov r4,r12
+ mov r0,r8
+ .syntax divided
+ adc r0,r4
+ .syntax unified
+ mov r8,r0
+ mov r1,r9
+ .syntax divided
+ adc r1,r5
+ .syntax unified
+ mov r9,r1
+ mov r2,r10
+ .syntax divided
+ adc r2,r6
+ .syntax unified
+ mov r10,r2
+ mov r3,r11
+ .syntax divided
+ adc r3,r7
+ .syntax unified
+ mov r11,r3
+ .syntax divided
+ mov r0,#0
+ adc r0,r0
+ .syntax unified
+ mov r12,r0
+ mov r0,r14
+ .syntax divided
+ ldm r0,{r0,r1,r2,r3,r4,r5,r6,r7}
+ add r0,r4
+ adc r1,r5
+ adc r2,r6
+ adc r3,r7
+ mov r4,#16
+ add r4,r14
+ stm r4!,{r0,r1,r2,r3}
+ .syntax unified
+ mov r14,r4
+ mov r0,r13
+ .syntax divided
+ ldm r0!,{r4,r5,r6,r7}
+ .syntax unified
+ mov r1,r8
+ .syntax divided
+ adc r4,r1
+ .syntax unified
+ mov r1,r9
+ .syntax divided
+ adc r5,r1
+ .syntax unified
+ mov r1,r10
+ .syntax divided
+ adc r6,r1
+ .syntax unified
+ mov r1,r11
+ .syntax divided
+ adc r7,r1
+ .syntax unified
+ mov r0,r14
+ .syntax divided
+ stm r0!,{r4,r5,r6,r7}
+ pop {r4,r5,r6,r7}
+ .syntax unified
+ mov r1,r12
+ .syntax divided
+ mov r2,#0
+ mvn r2,r2
+ adc r1,r2
+ asr r2,r1,#4
+ add r4,r1
+ adc r5,r2
+ adc r6,r2
+ adc r7,r2
+ stm r0!,{r4,r5,r6,r7}
+ pop {r3,r4,r5,r6,r7}
+ .syntax unified
+ mov r8,r3
+ mov r9,r4
+ mov r10,r5
+ mov r11,r6
+ mov r12,r7
+ .syntax divided
+ pop {r0,r4,r5,r6,r7,r15}
+// Cycle count of the ASM version of 256-bit sqr (refined Karatsuba) on Cortex M0: 793 cycles (697 instructions).
+ .size square256_asm, .-square256_asm
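square256_asm uses the squaring form of refined Karatsuba: the cross term
2*a0*a1 is recovered as a0^2 + a1^2 - (a0 - a1)^2, so each n-bit squaring
costs three half-size squarings and no general multiplication. The assembly
applies this recursively at 256, 128 and 64 bits (the 32-bit base case uses
the plain doubled l*h cross term instead). A compact C sketch of one level
of that identity (illustrative names; unsigned __int128 is a gcc/clang
extension):

#include <stdint.h>

typedef unsigned __int128 u128;

static void square64_karatsuba(uint64_t res[2], uint64_t a)
{
    uint32_t a0 = (uint32_t)a, a1 = (uint32_t)(a >> 32);
    uint32_t d = a0 >= a1 ? a0 - a1 : a1 - a0;   /* |a0 - a1| */

    uint64_t lo = (uint64_t)a0 * a0;
    uint64_t hi = (uint64_t)a1 * a1;
    uint64_t dd = (uint64_t)d * d;

    /* 2*a0*a1 == lo + hi - dd, always non-negative. */
    u128 mid = (u128)lo + hi - dd;

    u128 r = lo + (mid << 32) + ((u128)hi << 64);
    res[0] = (uint64_t)r;
    res[1] = (uint64_t)(r >> 64);
}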