From c830ecc1f728b722fde24a5da14a89f9223f291c Mon Sep 17 00:00:00 2001
From: Nicolas Boichat
Date: Thu, 9 May 2019 13:35:30 +0900
Subject: core/cortex-m0/curve25519: Move code to third_party folder

Also, add LICENSE file (some files are under CC0, some are public
domain), and METADATA file.

BRANCH=none
BUG=chromium:884905
TEST=make buildall -j, which also includes basic tests.

Change-Id: Ib3a7eb9245a0634c4052064c3e36cbe2ddafbcb9
Signed-off-by: Nicolas Boichat
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/ec/+/1599761
Reviewed-by: Aseda Aboagye
---
 .../core/cortex-m0/curve25519/mpy121666.S   |  181 +++
 .../core/cortex-m0/curve25519/mul.S         | 1111 +++++++++++++++++++
 .../core/cortex-m0/curve25519/reduce25519.S |  163 +++
 .../core/cortex-m0/curve25519/scalarmult.c  |  588 ++++++++++
 .../core/cortex-m0/curve25519/sqr.S         | 1164 ++++++++++++++++++++
 5 files changed, 3207 insertions(+)
 create mode 100644 third_party/unacl-curve25519/core/cortex-m0/curve25519/mpy121666.S
 create mode 100644 third_party/unacl-curve25519/core/cortex-m0/curve25519/mul.S
 create mode 100644 third_party/unacl-curve25519/core/cortex-m0/curve25519/reduce25519.S
 create mode 100644 third_party/unacl-curve25519/core/cortex-m0/curve25519/scalarmult.c
 create mode 100644 third_party/unacl-curve25519/core/cortex-m0/curve25519/sqr.S

diff --git a/third_party/unacl-curve25519/core/cortex-m0/curve25519/mpy121666.S b/third_party/unacl-curve25519/core/cortex-m0/curve25519/mpy121666.S
new file mode 100644
index 0000000000..d2a467459b
--- /dev/null
+++ b/third_party/unacl-curve25519/core/cortex-m0/curve25519/mpy121666.S
@@ -0,0 +1,181 @@
+// Implementation of multiplication of an fe25519 value by the curve constant 121666.
+//
+// B. Haase, Endress + Hauser Conducta GmbH & Co. KG
+// public domain.
+//
+// gnu assembler format.
+//
+// Generated and tested with C++ functions in the test subdirectory.
+//
+// ATTENTION:
+// Not yet tested on target hardware.
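+//
+// For orientation only, a rough C sketch of the value this routine
+// computes (fe25519_mpyWith121666_ref is a hypothetical name, not part
+// of this library; types are from stdint.h). It multiplies the 256-bit
+// little-endian input by 121666 and folds the carry and bit 255 back in
+// using 2^255 == 19 (mod 2^255 - 19). Like the assembly, the result is
+// only partially reduced; unlike the assembly, the sketch makes no
+// constant-time claims:
+//
+//   void fe25519_mpyWith121666_ref(uint32_t out[8], const uint32_t in[8])
+//   {
+//       uint64_t accu = 0;
+//       int i;
+//       for (i = 0; i < 8; i++) {
+//           accu += (uint64_t)in[i] * 121666;
+//           out[i] = (uint32_t)accu;
+//           accu >>= 32;
+//       }
+//       /* fold carry c and top bit b: c*2^256 + b*2^255 == 19*(2c + b) */
+//       accu = 19 * ((accu << 1) | (out[7] >> 31));
+//       out[7] &= 0x7fffffff;
+//       for (i = 0; i < 8; i++) {
+//           accu += out[i];
+//           out[i] = (uint32_t)accu;
+//           accu >>= 32;
+//       }
+//   }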
+ + + .code 16 + .text + .align 2 + + .global fe25519_mpyWith121666_asm + .code 16 + .thumb_func + .type fe25519_mpyWith121666_asm, %function + +fe25519_mpyWith121666_asm: + push {r4,r5,r6,r7,r14} + ldr r7,=56130 + ldr r2,[r1,#28] + lsl r5,r2,#16 + lsr r6,r2,#16 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r5,r2 + mov r2,#0 + adc r6,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r5,r2 + adc r6,r3 + lsl r2,r5,#1 + lsr r2,r2,#1 + str r2,[r0,#28] + lsr r5,r5,#31 + lsl r6,r6,#1 + orr r5,r6 + mov r6,#19 + mul r5,r6 + mov r6,#0 + ldr r2,[r1,#0] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r5,r3 + adc r6,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r5,r2 + mov r2,#0 + adc r6,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r5,r2 + adc r6,r3 + str r5,[r0,#0] + mov r5,#0 + ldr r2,[r1,#4] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r6,r3 + adc r5,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r6,r2 + mov r2,#0 + adc r5,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r6,r2 + adc r5,r3 + str r6,[r0,#4] + mov r6,#0 + ldr r2,[r1,#8] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r5,r3 + adc r6,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r5,r2 + mov r2,#0 + adc r6,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r5,r2 + adc r6,r3 + str r5,[r0,#8] + mov r5,#0 + ldr r2,[r1,#12] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r6,r3 + adc r5,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r6,r2 + mov r2,#0 + adc r5,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r6,r2 + adc r5,r3 + str r6,[r0,#12] + mov r6,#0 + ldr r2,[r1,#16] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r5,r3 + adc r6,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r5,r2 + mov r2,#0 + adc r6,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r5,r2 + adc r6,r3 + str r5,[r0,#16] + mov r5,#0 + ldr r2,[r1,#20] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r6,r3 + adc r5,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r6,r2 + mov r2,#0 + adc r5,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r6,r2 + adc r5,r3 + str r6,[r0,#20] + mov r6,#0 + ldr r2,[r1,#24] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r5,r3 + adc r6,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r5,r2 + mov r2,#0 + adc r6,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r5,r2 + adc r6,r3 + str r5,[r0,#24] + mov r5,#0 + ldr r2,[r0,#28] + add r6,r2 + str r6,[r0,#28] + pop {r4,r5,r6,r7,r15} + + .size fe25519_mpyWith121666_asm, .-fe25519_mpyWith121666_asm + diff --git a/third_party/unacl-curve25519/core/cortex-m0/curve25519/mul.S b/third_party/unacl-curve25519/core/cortex-m0/curve25519/mul.S new file mode 100644 index 0000000000..366713a7a3 --- /dev/null +++ b/third_party/unacl-curve25519/core/cortex-m0/curve25519/mul.S @@ -0,0 +1,1111 @@ + .align 2 + .global multiply256x256_asm + .type multiply256x256_asm, %function +multiply256x256_asm: + push {r4-r7,lr} + mov r3, r8 + mov r4, r9 + mov r5, r10 + mov r6, r11 + push {r0-r6} + mov r12, r0 + mov r10, r2 + mov r11, r1 + mov r0,r2 + //ldm r0!, {r4,r5,r6,r7} + ldm r0!, {r4,r5} + add r0,#8 + ldm r1!, {r2,r3,r6,r7} + push {r0,r1} + /////////BEGIN LOW PART ////////////////////// + /////////MUL128///////////// + //MUL64 + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, 
r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + ////////////////////////// + mov r4, r12 + stm r4!, {r0,r1} + push {r4} + push {r0,r1} + mov r1, r10 + mov r10, r2 + ldm r1, {r0, r1, r4, r5} + mov r2, r4 + mov r7, r5 + sub r2, r0 + sbc r7, r1 + sbc r6, r6 + eor r2, r6 + eor r7, r6 + sub r2, r6 + sbc r7, r6 + push {r2, r7} + mov r2, r11 + mov r11, r3 + ldm r2, {r0, r1, r2, r3} + sub r0, r2 + sbc r1, r3 + sbc r7, r7 + eor r0, r7 + eor r1, r7 + sub r0, r7 + sbc r1, r7 + eor r7, r6 + mov r12, r7 + push {r0, r1} + //MUL64 + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + mov r4, r10 + mov r5, r11 + eor r6, r6 + add r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r6 + mov r10, r2 + mov r11, r3 + pop {r2-r5} + push {r0, r1} + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + pop {r4, r5} + mov r6, r12 + mov r7, r12 + eor 
r0, r6 + eor r1, r6 + eor r2, r6 + eor r3, r6 + asr r6, r6, #1 + adc r0, r4 + adc r1, r5 + adc r4, r2 + adc r5, r3 + eor r2, r2 + adc r6,r2 + adc r7,r2 + pop {r2, r3} + mov r8, r2 + mov r9, r3 + add r2, r0 + adc r3, r1 + mov r0, r10 + mov r1, r11 + adc r4, r0 + adc r5, r1 + adc r6, r0 + adc r7, r1 + ////////END LOW PART///////////////////// + pop {r0} + stm r0!, {r2,r3} + pop {r1,r2} + push {r0} + push {r4-r7} + mov r10, r1 + mov r11, r2 + ldm r1!, {r4, r5} + ldm r2, {r2, r3} + /////////BEGIN HIGH PART//////////////// + /////////MUL128///////////// + //MUL64 + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + push {r0,r1} + mov r1, r10 + mov r10, r2 + ldm r1, {r0, r1, r4, r5} + mov r2, r4 + mov r7, r5 + sub r2, r0 + sbc r7, r1 + sbc r6, r6 + eor r2, r6 + eor r7, r6 + sub r2, r6 + sbc r7, r6 + push {r2, r7} + mov r2, r11 + mov r11, r3 + ldm r2, {r0, r1, r2, r3} + sub r0, r2 + sbc r1, r3 + sbc r7, r7 + eor r0, r7 + eor r1, r7 + sub r0, r7 + sbc r1, r7 + eor r7, r6 + mov r12, r7 + push {r0, r1} + //MUL64 + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + mov r4, r10 + mov r5, r11 + eor r6, r6 + add r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r6 + mov r10, r2 + mov r11, r3 + pop {r2-r5} + push {r0, r1} + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 
+ mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + pop {r4, r5} + mov r6, r12 + mov r7, r12 + eor r0, r6 + eor r1, r6 + eor r2, r6 + eor r3, r6 + asr r6, r6, #1 + adc r0, r4 + adc r1, r5 + adc r4, r2 + adc r5, r3 + eor r2, r2 + adc r6,r2 //0,1 + adc r7,r2 + pop {r2, r3} + mov r8, r2 + mov r9, r3 + add r2, r0 + adc r3, r1 + mov r0, r10 + mov r1, r11 + adc r4, r0 + adc r5, r1 + adc r6, r0 + adc r7, r1 + ////////END HIGH PART///////////////////// + mov r0, r8 + mov r1, r9 + mov r8, r6 + mov r9, r7 + pop {r6, r7} + add r0, r6 + adc r1, r7 + pop {r6, r7} + adc r2, r6 + adc r3, r7 + pop {r7} + stm r7!, {r0-r3} + mov r10, r7 + eor r0,r0 + mov r6, r8 + mov r7, r9 + adc r4, r0 + adc r5, r0 + adc r6, r0 + adc r7, r0 + pop {r0,r1,r2} + mov r12, r2 + push {r0, r4-r7} + ldm r1, {r0-r7} + sub r0, r4 + sbc r1, r5 + sbc r2, r6 + sbc r3, r7 + eor r4, r4 + sbc r4, r4 + eor r0, r4 + eor r1, r4 + eor r2, r4 + eor r3, r4 + sub r0, r4 + sbc r1, r4 + sbc r2, r4 + sbc r3, r4 + mov r6, r12 + mov r12, r4 //carry + mov r5, r10 + stm r5!, {r0-r3} + mov r11, r5 + mov r8, r0 + mov r9, r1 + ldm r6, {r0-r7} + sub r4, r0 + sbc r5, r1 + sbc r6, r2 + sbc r7, r3 + eor r0, r0 + sbc r0, r0 + eor r4, r0 + eor r5, r0 + eor r6, r0 + eor r7, r0 + sub r4, r0 + sbc r5, r0 + sbc r6, r0 + sbc r7, r0 + mov r1, r12 + eor r0, r1 + mov r1, r11 + stm r1!, {r4-r7} + push {r0} + mov r2, r8 + mov r3, r9 + /////////BEGIN MIDDLE PART//////////////// + /////////MUL128///////////// + //MUL64 + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + push {r0,r1} + mov r1, r10 + mov r10, r2 + ldm r1, {r0, r1, r4, r5} + mov r2, r4 + mov r7, r5 + sub r2, r0 + sbc r7, r1 + sbc r6, r6 + eor r2, r6 + eor r7, r6 + sub r2, r6 + sbc 
r7, r6 + push {r2, r7} + mov r2, r11 + mov r11, r3 + ldm r2, {r0, r1, r2, r3} + sub r0, r2 + sbc r1, r3 + sbc r7, r7 + eor r0, r7 + eor r1, r7 + sub r0, r7 + sbc r1, r7 + eor r7, r6 + mov r12, r7 + push {r0, r1} + //MUL64 + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + mov r4, r10 + mov r5, r11 + eor r6, r6 + add r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r6 + mov r10, r2 + mov r11, r3 + pop {r2-r5} + push {r0, r1} + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + pop {r4, r5} + mov r6, r12 + mov r7, r12 + eor r0, r6 + eor r1, r6 + eor r2, r6 + eor r3, r6 + asr r6, r6, #1 + adc r0, r4 + adc r1, r5 + adc r4, r2 + adc r5, r3 + eor r2, r2 + adc r6,r2 //0,1 + adc r7,r2 + pop {r2, r3} + mov r8, r2 + mov r9, r3 + add r2, r0 + adc r3, r1 + mov r0, r10 + mov r1, r11 + adc r4, r0 + adc r5, r1 + adc r6, r0 + adc r7, r1 + //////////END MIDDLE PART//////////////// + pop {r0,r1} //r0,r1 + mov r12, r0 //negative + eor r2, r0 + eor r3, r0 + eor r4, r0 + eor r5, r0 + eor r6, r0 + eor r7, r0 + push {r4-r7} + ldm r1!, {r4-r7} + mov r11, r1 //reference + mov r1, r9 + eor r1, r0 + mov r10, r4 + mov r4, r8 + asr r0, #1 + eor r0, r4 + mov r4, r10 + adc r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r7 + eor r4, r4 + adc r4, r4 + mov r10, r4 //carry + mov r4, r11 + ldm r4, {r4-r7} + add r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r7 + mov r9, r4 + mov r4, r11 + stm r4!, {r0-r3} + mov r11, r4 + pop {r0-r3} + mov r4, r9 + adc r4, r0 + adc r5, r1 + adc 
r6, r2
+ adc r7, r3
+ mov r1, #0
+ adc r1, r1
+ mov r0, r10
+ mov r10, r1 //carry
+ asr r0, #1
+ pop {r0-r3}
+ adc r4, r0
+ adc r5, r1
+ adc r6, r2
+ adc r7, r3
+ mov r8, r0
+ mov r0, r11
+ stm r0!, {r4-r7}
+ mov r11, r0
+ mov r0, r8
+ mov r6, r12
+ mov r5, r10
+ eor r4, r4
+ adc r5, r6
+ adc r6, r4
+ add r0, r5
+ adc r1, r6
+ adc r2, r6
+ adc r3, r6
+ mov r7, r11
+ stm r7!, {r0-r3}
+ pop {r3-r6}
+ mov r8, r3
+ mov r9, r4
+ mov r10, r5
+ mov r11, r6
+ pop {r4-r7,pc}
+ bx lr
+.size multiply256x256_asm, .-multiply256x256_asm
+
diff --git a/third_party/unacl-curve25519/core/cortex-m0/curve25519/reduce25519.S b/third_party/unacl-curve25519/core/cortex-m0/curve25519/reduce25519.S
new file mode 100644
index 0000000000..9a3c29a0f6
--- /dev/null
+++ b/third_party/unacl-curve25519/core/cortex-m0/curve25519/reduce25519.S
@@ -0,0 +1,163 @@
+// Implementation of a partial reduction modulo 2^255 - 19.
+//
+// B. Haase, Endress + Hauser Conducta GmbH & Co. KG
+// public domain.
+//
+// gnu assembler format.
+//
+// Generated and tested with C++ functions in the test subdirectory and on the target.
+//
+
+ .code 16
+
+ .text
+ .align 2
+
+ .global fe25519_reduceTo256Bits_asm
+ .code 16
+ .thumb_func
+ .type fe25519_reduceTo256Bits_asm, %function
+
+fe25519_reduceTo256Bits_asm:
+ push {r4,r5,r6,r7,r14}
+ ldr r2,[r1,#60]
+ lsr r3,r2,#16
+ uxth r2,r2
+ mov r7,#38
+ mul r2,r7
+ mul r3,r7
+ ldr r4,[r1,#28]
+ lsr r5,r3,#16
+ lsl r3,r3,#16
+ mov r6,#0
+ add r4,r2
+ adc r5,r6
+ add r4,r3
+ adc r5,r6
+ lsl r2,r4,#1
+ lsr r2,r2,#1
+ str r2,[r0,#28]
+ lsr r4,r4,#31
+ lsl r5,r5,#1
+ orr r4,r5
+ mov r2,#19
+ mul r2,r4
+ ldr r4,[r1,#0]
+ add r2,r4
+ mov r3,#0
+ adc r3,r6
+ ldr r4,[r1,#32]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r2,r4
+ adc r3,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ str r2,[r0,#0]
+ ldr r4,[r1,#4]
+ add r3,r4
+ mov r2,#0
+ adc r2,r6
+ ldr r4,[r1,#36]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r3,r4
+ adc r2,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r3,r4
+ adc r2,r5
+ str r3,[r0,#4]
+ ldr r4,[r1,#8]
+ add r2,r4
+ mov r3,#0
+ adc r3,r6
+ ldr r4,[r1,#40]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r2,r4
+ adc r3,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ str r2,[r0,#8]
+ ldr r4,[r1,#12]
+ add r3,r4
+ mov r2,#0
+ adc r2,r6
+ ldr r4,[r1,#44]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r3,r4
+ adc r2,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r3,r4
+ adc r2,r5
+ str r3,[r0,#12]
+ ldr r4,[r1,#16]
+ add r2,r4
+ mov r3,#0
+ adc r3,r6
+ ldr r4,[r1,#48]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r2,r4
+ adc r3,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ str r2,[r0,#16]
+ ldr r4,[r1,#20]
+ add r3,r4
+ mov r2,#0
+ adc r2,r6
+ ldr r4,[r1,#52]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r3,r4
+ adc r2,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r3,r4
+ adc r2,r5
+ str r3,[r0,#20]
+ ldr r4,[r1,#24]
+ add r2,r4
+ mov r3,#0
+ adc r3,r6
+ ldr r4,[r1,#56]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r2,r4
+ adc r3,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ str r2,[r0,#24]
+ ldr r4,[r0,#28]
+ add r4,r3
+ str r4,[r0,#28]
+ pop {r4,r5,r6,r7,r15}
+
+ .size fe25519_reduceTo256Bits_asm, .-fe25519_reduceTo256Bits_asm
+
diff --git a/third_party/unacl-curve25519/core/cortex-m0/curve25519/scalarmult.c b/third_party/unacl-curve25519/core/cortex-m0/curve25519/scalarmult.c
new file mode 100644
index 0000000000..07e2b144e7
--- /dev/null
+++ b/third_party/unacl-curve25519/core/cortex-m0/curve25519/scalarmult.c
@@ -0,0 +1,588 @@
+/* =======================
+ ============================ C/C++ HEADER FILE =============================
+ =======================
+
+ Collection of all submodules from naclM0 required for curve25519
+ scalar multiplication alone (not including randomization, etc.).
+
+ The naclM0 library is largely based on the avrNacl work of M. Hutter and P. Schwabe.
+
+ Will compile to the two functions
+
+ int
+ crypto_scalarmult_base_curve25519(
+ unsigned char* q,
+ const unsigned char* n
+ );
+
+ int
+ crypto_scalarmult_curve25519 (
+ unsigned char* r,
+ const unsigned char* s,
+ const unsigned char* p
+ );
+
+ Requires the inttypes.h header and the four external assembly functions
+
+ extern void
+ fe25519_reduceTo256Bits_asm (
+ fe25519 *res,
+ const UN_512bitValue *in
+ );
+
+ extern void
+ fe25519_mpyWith121666_asm (
+ fe25519* out,
+ const fe25519* in
+ );
+
+ extern void
+ multiply256x256_asm (
+ UN_512bitValue* result,
+ const UN_256bitValue* x,
+ const UN_256bitValue* y
+ );
+
+ extern void
+ square256_asm (
+ UN_512bitValue* result,
+ const UN_256bitValue* x
+ );
+
+ \file scalarmult.c
+
+ \author B. Haase, Endress + Hauser Conducta GmbH & Co. KG
+
+ Distributed under the conditions of the
+ Creative Commons CC0 1.0 Universal public domain dedication
+ ============================================================================*/
+
+#include "curve25519.h"
+#include "util.h"
+
+typedef uint8_t uint8;
+typedef uint16_t uint16;
+typedef uint32_t uint32;
+typedef uint64_t uint64;
+typedef uintptr_t uintptr;
+
+typedef int8_t int8;
+typedef int16_t int16;
+typedef int32_t int32;
+typedef int64_t int64;
+typedef intptr_t intptr;
+
+// Note that it's important to define the uint8 as the first union member, so
+// that an array of uint8 may be used as initializer.
+typedef union UN_256bitValue_
+{
+ uint8 as_uint8[32];
+ uint16 as_uint16[16];
+ uint32 as_uint32[8];
+ uint64 as_uint64[4];
+} UN_256bitValue;
+
+// Note that it's important to define the uint8 as the first union member, so
+// that an array of uint8 may be used as initializer.
+typedef union UN_512bitValue_
+{
+ uint8 as_uint8[64];
+ uint16 as_uint16[32];
+ uint32 as_uint32[16];
+ uint64 as_uint64[8];
+ UN_256bitValue as_256_bitValue[2];
+} UN_512bitValue;
+
+typedef UN_256bitValue fe25519;
+
+// ****************************************************
+// Assembly functions.
+// ****************************************************
+
+extern void
+fe25519_reduceTo256Bits_asm(
+ fe25519 *res,
+ const UN_512bitValue *in
+);
+
+#define fe25519_mpyWith121666 fe25519_mpyWith121666_asm
+extern void
+fe25519_mpyWith121666_asm (
+ fe25519* out,
+ const fe25519* in
+);
+
+#define multiply256x256 multiply256x256_asm
+extern void
+multiply256x256(
+ UN_512bitValue* result,
+ const UN_256bitValue* x,
+ const UN_256bitValue* y
+);
+
+#define square256 square256_asm
+extern void
+square256(
+ UN_512bitValue* result,
+ const UN_256bitValue* x
+);
+
+// ****************************************************
+// C functions for fe25519
+// ****************************************************
+
+static void
+fe25519_cpy(
+ fe25519* dest,
+ const fe25519* source
+)
+{
+ memcpy(dest, source, 32);
+}
+
+static void
+fe25519_unpack(
+ fe25519* out,
+ const unsigned char in[32]
+)
+{
+ memcpy(out, in, 32);
+
+ out->as_uint8[31] &= 0x7f; // make sure that the topmost bit is cleared.
+}
+
+static void
+fe25519_sub(
+ fe25519* out,
+ const fe25519* baseValue,
+ const fe25519* valueToSubstract
+)
+{
+ uint16 ctr;
+ int64 accu = 0;
+
+ // First subtract the most significant word, so that we may
+ // reduce the result "on the fly".
+ accu = baseValue->as_uint32[7];
+ accu -= valueToSubstract->as_uint32[7];
+
+ // We always set bit #31, and compensate this by subtracting 1 from the reduction
+ // value.
+ out->as_uint32[7] = ((uint32)accu) | 0x80000000ul;
+
+ accu = 19 * ((int32)(accu >> 31) - 1);
+ // ^ "-1" is the compensation for the "| 0x80000000ul" above.
+ // This choice makes sure that the result will be positive!
+
+ for (ctr = 0; ctr < 7; ctr += 1)
+ {
+ accu += baseValue->as_uint32[ctr];
+ accu -= valueToSubstract->as_uint32[ctr];
+
+ out->as_uint32[ctr] = (uint32)accu;
+ accu >>= 32;
+ }
+ accu += out->as_uint32[7];
+ out->as_uint32[7] = (uint32)accu;
+}
+
+static void
+fe25519_add(
+ fe25519* out,
+ const fe25519* baseValue,
+ const fe25519* valueToAdd
+)
+{
+ uint16 ctr = 0;
+ uint64 accu = 0;
+
+ // We first add the most significant word, so that we may reduce
+ // "on the fly".
+ accu = baseValue->as_uint32[7];
+ accu += valueToAdd->as_uint32[7];
+ out->as_uint32[7] = ((uint32)accu) & 0x7ffffffful;
+
+ accu = ((uint32)(accu >> 31)) * 19;
+
+ for (ctr = 0; ctr < 7; ctr += 1)
+ {
+ accu += baseValue->as_uint32[ctr];
+ accu += valueToAdd->as_uint32[ctr];
+
+ out->as_uint32[ctr] = (uint32)accu;
+ accu >>= 32;
+ }
+ accu += out->as_uint32[7];
+ out->as_uint32[7] = (uint32)accu;
+}
+
+static void
+fe25519_mul(
+ fe25519* result,
+ const fe25519* in1,
+ const fe25519* in2
+)
+{
+ UN_512bitValue tmp;
+
+ multiply256x256(&tmp, in1, in2);
+ fe25519_reduceTo256Bits_asm(result,&tmp);
+}
+
+static void
+fe25519_square(
+ fe25519* result,
+ const fe25519* in
+)
+{
+ UN_512bitValue tmp;
+
+ square256(&tmp, in);
+ fe25519_reduceTo256Bits_asm(result,&tmp);
+}
+
+static void
+fe25519_reduceCompletely(
+ fe25519* inout
+)
+{
+ uint32 numberOfTimesToSubstractPrime;
+ uint32 initialGuessForNumberOfTimesToSubstractPrime = inout->as_uint32[7] >> 31;
+ uint64 accu;
+ uint8 ctr;
+
+ // add one additional 19 to the estimated number of reductions.
+ // Do the calculation without writing back the results to memory.
+ //
+ // The initial guess of the required number of reductions is based
+ // on bit #31 of the most significant word.
+ // This initial guess may be wrong, since we might have a value
+ // v in the range
+ // 2^255 - 19 <= v < 2^255
+ // . After adding 19 to the value, we will have the correct
+ // number of required subtractions.
+ accu = initialGuessForNumberOfTimesToSubstractPrime * 19 + 19;
+
+ for (ctr = 0; ctr < 7; ctr++)
+ {
+ accu += inout->as_uint32[ctr];
+ accu >>= 32;
+ }
+ accu += inout->as_uint32[7];
+
+ numberOfTimesToSubstractPrime = (uint32)(accu >> 31);
+
+ // Do the reduction.
+ accu = numberOfTimesToSubstractPrime * 19;
+
+ for (ctr = 0; ctr < 7; ctr++)
+ {
+ accu += inout->as_uint32[ctr];
+ inout->as_uint32[ctr] = (uint32)accu;
+ accu >>= 32;
+ }
+ accu += inout->as_uint32[7];
+ inout->as_uint32[7] = accu & 0x7ffffffful;
+}
+
+/// We are already using a packed radix 2^16 representation for fe25519. The real use for this function
+/// is for architectures that use more bits for storing an fe25519 in a representation where multiplication
+/// may be calculated more efficiently.
+/// Here we simply copy the data.
+static void
+fe25519_pack(
+ unsigned char out[32],
+ fe25519* in
+)
+{
+ fe25519_reduceCompletely(in);
+
+ memcpy(out, in, 32);
+}
+
+// Note that r and x are allowed to overlap!
+static void
+fe25519_invert_useProvidedScratchBuffers(
+ fe25519* r,
+ const fe25519* x,
+ fe25519* t0,
+ fe25519* t1,
+ fe25519* t2
+)
+{
+ fe25519 *z11 = r; // store z11 in r (in order to save one temporary).
+ fe25519 *z2_10_0 = t1;
+ fe25519 *z2_50_0 = t2;
+ fe25519 *z2_100_0 = z2_10_0;
+
+ uint8 i;
+
+ {
+ fe25519 *z2 = z2_50_0;
+
+ /* 2 */ fe25519_square(z2, x);
+ /* 4 */ fe25519_square(t0, z2);
+ /* 8 */ fe25519_square(t0, t0);
+ /* 9 */ fe25519_mul(z2_10_0, t0, x);
+ /* 11 */ fe25519_mul(z11, z2_10_0, z2);
+
+ // z2 is dead.
+ }
+
+ /* 22 */ fe25519_square(t0, z11);
+ /* 2^5 - 2^0 = 31 */ fe25519_mul(z2_10_0, t0, z2_10_0);
+
+ /* 2^6 - 2^1 */ fe25519_square(t0, z2_10_0);
+ /* 2^7 - 2^2 */ fe25519_square(t0, t0);
+ /* 2^8 - 2^3 */ fe25519_square(t0, t0);
+ /* 2^9 - 2^4 */ fe25519_square(t0, t0);
+ /* 2^10 - 2^5 */ fe25519_square(t0, t0);
+ /* 2^10 - 2^0 */ fe25519_mul(z2_10_0, t0, z2_10_0);
+
+ /* 2^11 - 2^1 */ fe25519_square(t0, z2_10_0);
+
+ /* 2^20 - 2^10 */ for (i = 1; i < 10; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^20 - 2^0 */ fe25519_mul(z2_50_0, t0, z2_10_0);
+
+ /* 2^21 - 2^1 */ fe25519_square(t0, z2_50_0);
+
+ /* 2^40 - 2^20 */ for (i = 1; i < 20; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^40 - 2^0 */ fe25519_mul(t0, t0, z2_50_0);
+
+ /* 2^41 - 2^1 */ fe25519_square(t0, t0);
+
+ /* 2^50 - 2^10 */ for (i = 1; i < 10; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^50 - 2^0 */ fe25519_mul(z2_50_0, t0, z2_10_0);
+
+ /* 2^51 - 2^1 */ fe25519_square(t0, z2_50_0);
+
+ /* 2^100 - 2^50 */ for (i = 1; i < 50; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^100 - 2^0 */ fe25519_mul(z2_100_0, t0, z2_50_0);
+
+ /* 2^101 - 2^1 */ fe25519_square(t0, z2_100_0);
+
+ /* 2^200 - 2^100 */ for (i = 1; i < 100; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^200 - 2^0 */ fe25519_mul(t0, t0, z2_100_0);
+
+ /* 2^250 - 2^50 */ for (i = 0; i < 50; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^250 - 2^0 */ fe25519_mul(t0, t0, z2_50_0);
+
+ /* 2^255 - 2^5 */ for (i = 0; i < 5; i ++)
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^255 - 21 */ fe25519_mul(r, t0, z11);
+}
+
+static void
+fe25519_setzero(
+ fe25519* out
+)
+{
+ uint8 ctr;
+
+ for (ctr = 0; ctr < 8; ctr++)
+ {
+ out->as_uint32[ctr] = 0;
+ }
+}
+
+static void
+fe25519_setone(
+ fe25519* out
+)
+{
+ uint8 ctr;
+
+ out->as_uint32[0] = 1;
+
+ for (ctr = 1; ctr < 8; ctr++)
+ {
+ out->as_uint32[ctr] = 0;
+ }
+}
+
+static void
+fe25519_cswap(
+ fe25519* in1,
+ fe25519* in2,
+ int condition
+)
+{
+ int32 mask = condition;
+ uint32 ctr;
+
+ mask = -mask;
+
+ for (ctr = 0; ctr < 8; ctr++)
+ {
+ uint32 val1 = in1->as_uint32[ctr];
+ uint32 val2 = in2->as_uint32[ctr];
+ uint32 temp = val1;
+
+ val1 ^= mask & (val2 ^ val1);
+ val2 ^= mask & (val2 ^ temp);
+
+ in1->as_uint32[ctr] = val1;
+ in2->as_uint32[ctr] = val2;
+ }
+}
+
+// ****************************************************
+// Scalar multiplication implementation.
+// ****************************************************
+
+typedef struct _ST_curve25519ladderstepWorkingState
+{
+ // The base point in affine coordinates
+ fe25519 x0;
+
+ // The two working points p, q, in projective coordinates. Possibly randomized.
+ fe25519 xp;
+ fe25519 zp;
+ fe25519 xq;
+ fe25519 zq;
+
+ UN_256bitValue s;
+
+ int nextScalarBitToProcess;
+ uint8 previousProcessedBit;
+} ST_curve25519ladderstepWorkingState;
+
+static void
+curve25519_ladderstep(
+ ST_curve25519ladderstepWorkingState* pState
+)
+{
+ // Implements the "ladd-1987-m-3" differential-addition-and-doubling formulas
+ // Source: 1987 Montgomery "Speeding the Pollard and elliptic curve methods of factorization", page 261,
+ // fifth and sixth displays, plus common-subexpression elimination.
+ //
+ // Notation from the explicit formulas database:
+ // (X2,Z2) corresponds to (xp,zp),
+ // (X3,Z3) corresponds to (xq,zq)
+ // Result (X4,Z4) (X5,Z5) expected in (xp,zp) and (xq,zq)
+ //
+ // A = X2+Z2; AA = A^2; B = X2-Z2; BB = B^2; E = AA-BB; C = X3+Z3; D = X3-Z3;
+ // DA = D*A; CB = C*B; t0 = DA+CB; t1 = t0^2; X5 = Z1*t1; t2 = DA-CB;
+ // t3 = t2^2; Z5 = X1*t3; X4 = AA*BB; t4 = a24*E; t5 = BB+t4; Z4 = E*t5 ;
+ //
+ // Re-ordered to use fewer temporaries.
+
+ fe25519 t1, t2;
+
+ fe25519 *b1=&pState->xp; fe25519 *b2=&pState->zp;
+ fe25519 *b3=&pState->xq; fe25519 *b4=&pState->zq;
+
+ fe25519 *b5= &t1; fe25519 *b6=&t2;
+
+ fe25519_add(b5,b1,b2); // A = X2+Z2
+ fe25519_sub(b6,b1,b2); // B = X2-Z2
+ fe25519_add(b1,b3,b4); // C = X3+Z3
+ fe25519_sub(b2,b3,b4); // D = X3-Z3
+ fe25519_mul(b3,b2,b5); // DA= D*A
+ fe25519_mul(b2,b1,b6); // CB= C*B
+ fe25519_add(b1,b2,b3); // T0= DA+CB
+ fe25519_sub(b4,b3,b2); // T2= DA-CB
+ fe25519_square(b3,b1); // X5==T1= T0^2
+ fe25519_square(b1,b4); // T3= t2^2
+ fe25519_mul(b4,b1,&pState->x0); // Z5=X1*t3
+ fe25519_square(b1,b5); // AA=A^2
+ fe25519_square(b5,b6); // BB=B^2
+ fe25519_sub(b2,b1,b5); // E=AA-BB
+ fe25519_mul(b1,b5,b1); // X4= AA*BB
+ fe25519_mpyWith121666 (b6,b2); // T4 = a24*E
+ fe25519_add(b6,b6,b5); // T5 = BB + t4
+ fe25519_mul(b2,b6,b2); // Z4 = E*t5
+}
+
+static void
+curve25519_cswap(
+ ST_curve25519ladderstepWorkingState* state,
+ uint8 b
+)
+{
+ fe25519_cswap (&state->xp, &state->xq,b);
+ fe25519_cswap (&state->zp, &state->zq,b);
+}
+
+void
+x25519_scalar_mult(
+ uint8_t r[32],
+ const uint8_t s[32],
+ const uint8_t p[32]
+)
+{
+ ST_curve25519ladderstepWorkingState state;
+ unsigned char i;
+
+ // Prepare the scalar within the working state buffer.
+ for (i = 0; i < 32; i++)
+ {
+ state.s.as_uint8 [i] = s[i];
+ }
+ state.s.as_uint8 [0] &= 248;
+ state.s.as_uint8 [31] &= 127;
+ state.s.as_uint8 [31] |= 64;
+
+ // Copy the affine x-coordinate of the base point to the state.
+ fe25519_unpack (&state.x0, p);
+
+ // Prepare the working points within the working state struct.
+
+ fe25519_setone (&state.zq);
+ fe25519_cpy (&state.xq, &state.x0);
+
+ fe25519_setone(&state.xp);
+ fe25519_setzero(&state.zp);
+
+ state.nextScalarBitToProcess = 254;
+
+ state.previousProcessedBit = 0;
+
+ // Process bits 254 down to 0 of the clamped scalar.
+ while (state.nextScalarBitToProcess >= 0)
+ {
+ uint8 byteNo = state.nextScalarBitToProcess >> 3;
+ uint8 bitNo = state.nextScalarBitToProcess & 7;
+ uint8 bit;
+ uint8 swap;
+
+ bit = 1 & (state.s.as_uint8 [byteNo] >> bitNo);
+ swap = bit ^ state.previousProcessedBit;
+ state.previousProcessedBit = bit;
+ curve25519_cswap(&state, swap);
+ curve25519_ladderstep(&state);
+ state.nextScalarBitToProcess --;
+ }
+
+ curve25519_cswap(&state,state.previousProcessedBit);
+
+ // optimize for stack usage.
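+ // The ladder leaves the result in projective coordinates (xp : zp);
+ // the affine x-coordinate is xp * zp^-1. The inversion below uses
+ // Fermat's little theorem (zp^(p-2) for p = 2^255 - 19) and reuses
+ // xq, zq and x0, which are no longer needed, as scratch buffers.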
+ fe25519_invert_useProvidedScratchBuffers (&state.zp, &state.zp, &state.xq, &state.zq, &state.x0); + fe25519_mul(&state.xp, &state.xp, &state.zp); + fe25519_reduceCompletely(&state.xp); + + fe25519_pack (r, &state.xp); +} diff --git a/third_party/unacl-curve25519/core/cortex-m0/curve25519/sqr.S b/third_party/unacl-curve25519/core/cortex-m0/curve25519/sqr.S new file mode 100644 index 0000000000..b62121adb7 --- /dev/null +++ b/third_party/unacl-curve25519/core/cortex-m0/curve25519/sqr.S @@ -0,0 +1,1164 @@ +// Author: Ana Helena Sánchez, Björn Haase (second implementation). +// +// public domain +// + + .align 2 + .global square256_asm + .type square256_asm, %function +square256_asm: +// ###################### +// ASM Square 256 refined karatsuba: +// ###################### + // sqr 256 Refined Karatsuba + // pInput in r1 + // pResult in r0 + // adheres to arm eabi calling convention. + push {r1,r4,r5,r6,r7,r14} + .syntax unified + mov r3,r8 + mov r4,r9 + mov r5,r10 + mov r6,r11 + mov r7,r12 + .syntax divided + push {r3,r4,r5,r6,r7} + .syntax unified + mov r14,r0 + .syntax divided + ldm r1!,{r4,r5,r6,r7} + // sqr 128 Refined Karatsuba + // Input in r4 ... r7 + // Result in r0 ... r7 + // clobbers all registers except for r14 + .syntax unified + mov r0,r4 + mov r1,r5 + .syntax divided + sub r0,r6 + sbc r1,r7 + sbc r2,r2 + eor r0,r2 + eor r1,r2 + sub r0,r2 + sbc r1,r2 + .syntax unified + mov r8,r0 + mov r9,r1 + mov r10,r6 + .syntax divided + // START: sqr 64 Refined Karatsuba + // Input operands in r4,r5 + // Result in r0,r1,r2,r3 + // Clobbers: r4-r6 + // START: sqr 32 + // Input operand in r4 + // Result in r0 ,r1 + // Clobbers: r2, r3 + uxth r0,r4 + lsr r1,r4,#16 + .syntax unified + mov r2,r0 + .syntax divided + mul r2,r1 + mul r0,r0 + mul r1,r1 + lsr r3,r2,#15 + lsl r2,r2,#17 + add r0,r2 + adc r1,r3 + // End: sqr 32 + // Result in r0 ,r1 + sub r4,r5 + sbc r6,r6 + eor r4,r6 + sub r4,r6 + // START: sqr 32 + // Input operand in r5 + // Result in r2 ,r3 + // Clobbers: r5, r6 + uxth r2,r5 + lsr r3,r5,#16 + .syntax unified + mov r5,r2 + .syntax divided + mul r5,r3 + mul r2,r2 + mul r3,r3 + lsr r6,r5,#15 + lsl r5,r5,#17 + add r2,r5 + adc r3,r6 + // End: sqr 32 + // Result in r2 ,r3 + mov r6,#0 + add r2,r1 + adc r3,r6 + // START: sqr 32 + // Input operand in r4 + // Result in r4 ,r5 + // Clobbers: r1, r6 + lsr r5,r4,#16 + uxth r4,r4 + .syntax unified + mov r1,r4 + .syntax divided + mul r1,r5 + mul r4,r4 + mul r5,r5 + lsr r6,r1,#15 + lsl r1,r1,#17 + add r4,r1 + adc r5,r6 + // End: sqr 32 + // Result in r4 ,r5 + .syntax unified + mov r1,r2 + .syntax divided + sub r1,r4 + sbc r2,r5 + .syntax unified + mov r5,r3 + .syntax divided + mov r6,#0 + sbc r3,r6 + add r1,r0 + adc r2,r5 + adc r3,r6 + // END: sqr 64 Refined Karatsuba + // Result in r0,r1,r2,r3 + // Leaves r6 zero. 
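+ // (Refined Karatsuba squaring, applied at each level with
+ // x = xH*B + xL for B = 2^128, 2^64 and 2^32:
+ // x^2 = xH^2*B^2 + xL^2 + (xL^2 + xH^2 - (xL - xH)^2)*B,
+ // so each level costs three half-size squarings plus the
+ // absolute difference |xL - xH| computed above.)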
+ .syntax unified + mov r6,r10 + mov r10,r0 + mov r11,r1 + mov r12,r2 + mov r1,r3 + .syntax divided + // START: sqr 64 Refined Karatsuba + // Input operands in r6,r7 + // Result in r2,r3,r4,r5 + // Clobbers: r0,r7,r6 + // START: sqr 32 + // Input operand in r6 + // Result in r2 ,r3 + // Clobbers: r4, r5 + uxth r2,r6 + lsr r3,r6,#16 + .syntax unified + mov r4,r2 + .syntax divided + mul r4,r3 + mul r2,r2 + mul r3,r3 + lsr r5,r4,#15 + lsl r4,r4,#17 + add r2,r4 + adc r3,r5 + // End: sqr 32 + // Result in r2 ,r3 + sub r6,r7 + sbc r4,r4 + eor r6,r4 + sub r6,r4 + // START: sqr 32 + // Input operand in r7 + // Result in r4 ,r5 + // Clobbers: r0, r7 + uxth r4,r7 + lsr r5,r7,#16 + .syntax unified + mov r0,r4 + .syntax divided + mul r0,r5 + mul r4,r4 + mul r5,r5 + lsr r7,r0,#15 + lsl r0,r0,#17 + add r4,r0 + adc r5,r7 + // End: sqr 32 + // Result in r4 ,r5 + mov r7,#0 + add r4,r3 + adc r5,r7 + // START: sqr 32 + // Input operand in r6 + // Result in r7 ,r0 + // Clobbers: r6, r3 + uxth r7,r6 + lsr r0,r6,#16 + .syntax unified + mov r6,r7 + .syntax divided + mul r6,r0 + mul r7,r7 + mul r0,r0 + lsr r3,r6,#15 + lsl r6,r6,#17 + add r7,r6 + adc r0,r3 + // End: sqr 32 + // Result in r7 ,r0 + .syntax unified + mov r3,r4 + .syntax divided + sub r3,r7 + sbc r4,r0 + .syntax unified + mov r0,r5 + .syntax divided + mov r6,#0 + sbc r5,r6 + add r3,r2 + adc r4,r0 + adc r5,r6 + // END: sqr 64 Refined Karatsuba + // Result in r2,r3,r4,r5 + // Leaves r6 zero. + .syntax unified + mov r0,r12 + .syntax divided + add r2,r0 + adc r3,r1 + adc r4,r6 + adc r5,r6 + .syntax unified + mov r12,r2 + mov r2,r8 + mov r8,r3 + mov r3,r9 + mov r9,r4 + .syntax divided + // START: sqr 64 Refined Karatsuba + // Input operands in r2,r3 + // Result in r6,r7,r0,r1 + // Clobbers: r2,r3,r4 + // START: sqr 32 + // Input operand in r2 + // Result in r6 ,r7 + // Clobbers: r0, r1 + uxth r6,r2 + lsr r7,r2,#16 + .syntax unified + mov r0,r6 + .syntax divided + mul r0,r7 + mul r6,r6 + mul r7,r7 + lsr r1,r0,#15 + lsl r0,r0,#17 + add r6,r0 + adc r7,r1 + // End: sqr 32 + // Result in r6 ,r7 + sub r2,r3 + sbc r4,r4 + eor r2,r4 + sub r2,r4 + // START: sqr 32 + // Input operand in r3 + // Result in r0 ,r1 + // Clobbers: r3, r4 + uxth r0,r3 + lsr r1,r3,#16 + .syntax unified + mov r3,r0 + .syntax divided + mul r3,r1 + mul r0,r0 + mul r1,r1 + lsr r4,r3,#15 + lsl r3,r3,#17 + add r0,r3 + adc r1,r4 + // End: sqr 32 + // Result in r0 ,r1 + mov r4,#0 + add r0,r7 + adc r1,r4 + // START: sqr 32 + // Input operand in r2 + // Result in r3 ,r4 + // Clobbers: r2, r7 + uxth r3,r2 + lsr r4,r2,#16 + .syntax unified + mov r2,r3 + .syntax divided + mul r2,r4 + mul r3,r3 + mul r4,r4 + lsr r7,r2,#15 + lsl r2,r2,#17 + add r3,r2 + adc r4,r7 + // End: sqr 32 + // Result in r3 ,r4 + .syntax unified + mov r7,r0 + .syntax divided + sub r7,r3 + sbc r0,r4 + .syntax unified + mov r2,r1 + .syntax divided + mov r4,#0 + sbc r1,r4 + add r7,r6 + adc r0,r2 + adc r1,r4 + // END: sqr 64 Refined Karatsuba + // Result in r6,r7,r0,r1 + // Returns r4 as zero. 
+ .syntax unified + mov r2,r12 + mov r3,r8 + mov r4,r9 + .syntax divided + sub r2,r6 + sbc r3,r7 + .syntax unified + mov r6,r4 + mov r7,r5 + .syntax divided + sbc r4,r0 + sbc r5,r1 + mov r0,#0 + sbc r6,r0 + sbc r7,r0 + .syntax unified + mov r0,r10 + .syntax divided + add r2,r0 + .syntax unified + mov r1,r11 + .syntax divided + adc r3,r1 + .syntax unified + mov r0,r12 + .syntax divided + adc r4,r0 + .syntax unified + mov r0,r8 + .syntax divided + adc r5,r0 + mov r0,#0 + adc r6,r0 + adc r7,r0 + .syntax unified + mov r0,r10 + .syntax divided + // END: sqr 128 Refined Karatsuba + // Result in r0 ... r7 + push {r4,r5,r6,r7} + .syntax unified + mov r4,r14 + .syntax divided + stm r4!,{r0,r1,r2,r3} + ldr r4,[SP,#36] + add r4,#16 + ldm r4,{r4,r5,r6,r7} + // sqr 128 Refined Karatsuba + // Input in r4 ... r7 + // Result in r0 ... r7 + // clobbers all registers except for r14 + .syntax unified + mov r0,r4 + mov r1,r5 + .syntax divided + sub r0,r6 + sbc r1,r7 + sbc r2,r2 + eor r0,r2 + eor r1,r2 + sub r0,r2 + sbc r1,r2 + .syntax unified + mov r8,r0 + mov r9,r1 + mov r10,r6 + .syntax divided + // START: sqr 64 Refined Karatsuba + // Input operands in r4,r5 + // Result in r0,r1,r2,r3 + // Clobbers: r4-r6 + // START: sqr 32 + // Input operand in r4 + // Result in r0 ,r1 + // Clobbers: r2, r3 + uxth r0,r4 + lsr r1,r4,#16 + .syntax unified + mov r2,r0 + .syntax divided + mul r2,r1 + mul r0,r0 + mul r1,r1 + lsr r3,r2,#15 + lsl r2,r2,#17 + add r0,r2 + adc r1,r3 + // End: sqr 32 + // Result in r0 ,r1 + sub r4,r5 + sbc r6,r6 + eor r4,r6 + sub r4,r6 + // START: sqr 32 + // Input operand in r5 + // Result in r2 ,r3 + // Clobbers: r5, r6 + uxth r2,r5 + lsr r3,r5,#16 + .syntax unified + mov r5,r2 + .syntax divided + mul r5,r3 + mul r2,r2 + mul r3,r3 + lsr r6,r5,#15 + lsl r5,r5,#17 + add r2,r5 + adc r3,r6 + // End: sqr 32 + // Result in r2 ,r3 + mov r6,#0 + add r2,r1 + adc r3,r6 + // START: sqr 32 + // Input operand in r4 + // Result in r4 ,r5 + // Clobbers: r1, r6 + lsr r5,r4,#16 + uxth r4,r4 + .syntax unified + mov r1,r4 + .syntax divided + mul r1,r5 + mul r4,r4 + mul r5,r5 + lsr r6,r1,#15 + lsl r1,r1,#17 + add r4,r1 + adc r5,r6 + // End: sqr 32 + // Result in r4 ,r5 + .syntax unified + mov r1,r2 + .syntax divided + sub r1,r4 + sbc r2,r5 + .syntax unified + mov r5,r3 + .syntax divided + mov r6,#0 + sbc r3,r6 + add r1,r0 + adc r2,r5 + adc r3,r6 + // END: sqr 64 Refined Karatsuba + // Result in r0,r1,r2,r3 + // Leaves r6 zero. 
+ .syntax unified + mov r6,r10 + mov r10,r0 + mov r11,r1 + mov r12,r2 + mov r1,r3 + .syntax divided + // START: sqr 64 Refined Karatsuba + // Input operands in r6,r7 + // Result in r2,r3,r4,r5 + // Clobbers: r0,r7,r6 + // START: sqr 32 + // Input operand in r6 + // Result in r2 ,r3 + // Clobbers: r4, r5 + uxth r2,r6 + lsr r3,r6,#16 + .syntax unified + mov r4,r2 + .syntax divided + mul r4,r3 + mul r2,r2 + mul r3,r3 + lsr r5,r4,#15 + lsl r4,r4,#17 + add r2,r4 + adc r3,r5 + // End: sqr 32 + // Result in r2 ,r3 + sub r6,r7 + sbc r4,r4 + eor r6,r4 + sub r6,r4 + // START: sqr 32 + // Input operand in r7 + // Result in r4 ,r5 + // Clobbers: r0, r7 + uxth r4,r7 + lsr r5,r7,#16 + .syntax unified + mov r0,r4 + .syntax divided + mul r0,r5 + mul r4,r4 + mul r5,r5 + lsr r7,r0,#15 + lsl r0,r0,#17 + add r4,r0 + adc r5,r7 + // End: sqr 32 + // Result in r4 ,r5 + mov r7,#0 + add r4,r3 + adc r5,r7 + // START: sqr 32 + // Input operand in r6 + // Result in r7 ,r0 + // Clobbers: r6, r3 + uxth r7,r6 + lsr r0,r6,#16 + .syntax unified + mov r6,r7 + .syntax divided + mul r6,r0 + mul r7,r7 + mul r0,r0 + lsr r3,r6,#15 + lsl r6,r6,#17 + add r7,r6 + adc r0,r3 + // End: sqr 32 + // Result in r7 ,r0 + .syntax unified + mov r3,r4 + .syntax divided + sub r3,r7 + sbc r4,r0 + .syntax unified + mov r0,r5 + .syntax divided + mov r6,#0 + sbc r5,r6 + add r3,r2 + adc r4,r0 + adc r5,r6 + // END: sqr 64 Refined Karatsuba + // Result in r2,r3,r4,r5 + // Leaves r6 zero. + .syntax unified + mov r0,r12 + .syntax divided + add r2,r0 + adc r3,r1 + adc r4,r6 + adc r5,r6 + .syntax unified + mov r12,r2 + mov r2,r8 + mov r8,r3 + mov r3,r9 + mov r9,r4 + .syntax divided + // START: sqr 64 Refined Karatsuba + // Input operands in r2,r3 + // Result in r6,r7,r0,r1 + // Clobbers: r2,r3,r4 + // START: sqr 32 + // Input operand in r2 + // Result in r6 ,r7 + // Clobbers: r0, r1 + uxth r6,r2 + lsr r7,r2,#16 + .syntax unified + mov r0,r6 + .syntax divided + mul r0,r7 + mul r6,r6 + mul r7,r7 + lsr r1,r0,#15 + lsl r0,r0,#17 + add r6,r0 + adc r7,r1 + // End: sqr 32 + // Result in r6 ,r7 + sub r2,r3 + sbc r4,r4 + eor r2,r4 + sub r2,r4 + // START: sqr 32 + // Input operand in r3 + // Result in r0 ,r1 + // Clobbers: r3, r4 + uxth r0,r3 + lsr r1,r3,#16 + .syntax unified + mov r3,r0 + .syntax divided + mul r3,r1 + mul r0,r0 + mul r1,r1 + lsr r4,r3,#15 + lsl r3,r3,#17 + add r0,r3 + adc r1,r4 + // End: sqr 32 + // Result in r0 ,r1 + mov r4,#0 + add r0,r7 + adc r1,r4 + // START: sqr 32 + // Input operand in r2 + // Result in r3 ,r4 + // Clobbers: r2, r7 + uxth r3,r2 + lsr r4,r2,#16 + .syntax unified + mov r2,r3 + .syntax divided + mul r2,r4 + mul r3,r3 + mul r4,r4 + lsr r7,r2,#15 + lsl r2,r2,#17 + add r3,r2 + adc r4,r7 + // End: sqr 32 + // Result in r3 ,r4 + .syntax unified + mov r7,r0 + .syntax divided + sub r7,r3 + sbc r0,r4 + .syntax unified + mov r2,r1 + .syntax divided + mov r4,#0 + sbc r1,r4 + add r7,r6 + adc r0,r2 + adc r1,r4 + // END: sqr 64 Refined Karatsuba + // Result in r6,r7,r0,r1 + // Returns r4 as zero. 
+ .syntax unified + mov r2,r12 + mov r3,r8 + mov r4,r9 + .syntax divided + sub r2,r6 + sbc r3,r7 + .syntax unified + mov r6,r4 + mov r7,r5 + .syntax divided + sbc r4,r0 + sbc r5,r1 + mov r0,#0 + sbc r6,r0 + sbc r7,r0 + .syntax unified + mov r0,r10 + .syntax divided + add r2,r0 + .syntax unified + mov r1,r11 + .syntax divided + adc r3,r1 + .syntax unified + mov r0,r12 + .syntax divided + adc r4,r0 + .syntax unified + mov r0,r8 + .syntax divided + adc r5,r0 + mov r0,#0 + adc r6,r0 + adc r7,r0 + .syntax unified + mov r0,r10 + .syntax divided + // END: sqr 128 Refined Karatsuba + // Result in r0 ... r7 + .syntax unified + mov r8,r4 + mov r9,r5 + mov r10,r6 + mov r11,r7 + .syntax divided + pop {r4,r5,r6,r7} + add r0,r4 + adc r1,r5 + adc r2,r6 + adc r3,r7 + .syntax unified + mov r4,r8 + mov r5,r9 + mov r6,r10 + mov r7,r11 + mov r8,r0 + .syntax divided + mov r0,#0 + adc r4,r0 + adc r5,r0 + adc r6,r0 + adc r7,r0 + .syntax unified + mov r0,r8 + .syntax divided + push {r0,r1,r2,r3,r4,r5,r6,r7} + ldr r4,[SP,#52] + ldm r4,{r0,r1,r2,r3,r4,r5,r6,r7} + sub r4,r0 + sbc r5,r1 + sbc r6,r2 + sbc r7,r3 + sbc r0,r0 + eor r4,r0 + eor r5,r0 + eor r6,r0 + eor r7,r0 + sub r4,r0 + sbc r5,r0 + sbc r6,r0 + sbc r7,r0 + // sqr 128 Refined Karatsuba + // Input in r4 ... r7 + // Result in r0 ... r7 + // clobbers all registers except for r14 + .syntax unified + mov r0,r4 + mov r1,r5 + .syntax divided + sub r0,r6 + sbc r1,r7 + sbc r2,r2 + eor r0,r2 + eor r1,r2 + sub r0,r2 + sbc r1,r2 + .syntax unified + mov r8,r0 + mov r9,r1 + mov r10,r6 + .syntax divided + // START: sqr 64 Refined Karatsuba + // Input operands in r4,r5 + // Result in r0,r1,r2,r3 + // Clobbers: r4-r6 + // START: sqr 32 + // Input operand in r4 + // Result in r0 ,r1 + // Clobbers: r2, r3 + uxth r0,r4 + lsr r1,r4,#16 + .syntax unified + mov r2,r0 + .syntax divided + mul r2,r1 + mul r0,r0 + mul r1,r1 + lsr r3,r2,#15 + lsl r2,r2,#17 + add r0,r2 + adc r1,r3 + // End: sqr 32 + // Result in r0 ,r1 + sub r4,r5 + sbc r6,r6 + eor r4,r6 + sub r4,r6 + // START: sqr 32 + // Input operand in r5 + // Result in r2 ,r3 + // Clobbers: r5, r6 + uxth r2,r5 + lsr r3,r5,#16 + .syntax unified + mov r5,r2 + .syntax divided + mul r5,r3 + mul r2,r2 + mul r3,r3 + lsr r6,r5,#15 + lsl r5,r5,#17 + add r2,r5 + adc r3,r6 + // End: sqr 32 + // Result in r2 ,r3 + mov r6,#0 + add r2,r1 + adc r3,r6 + // START: sqr 32 + // Input operand in r4 + // Result in r4 ,r5 + // Clobbers: r1, r6 + lsr r5,r4,#16 + uxth r4,r4 + .syntax unified + mov r1,r4 + .syntax divided + mul r1,r5 + mul r4,r4 + mul r5,r5 + lsr r6,r1,#15 + lsl r1,r1,#17 + add r4,r1 + adc r5,r6 + // End: sqr 32 + // Result in r4 ,r5 + .syntax unified + mov r1,r2 + .syntax divided + sub r1,r4 + sbc r2,r5 + .syntax unified + mov r5,r3 + .syntax divided + mov r6,#0 + sbc r3,r6 + add r1,r0 + adc r2,r5 + adc r3,r6 + // END: sqr 64 Refined Karatsuba + // Result in r0,r1,r2,r3 + // Leaves r6 zero. 
+ .syntax unified + mov r6,r10 + mov r10,r0 + mov r11,r1 + mov r12,r2 + mov r1,r3 + .syntax divided + // START: sqr 64 Refined Karatsuba + // Input operands in r6,r7 + // Result in r2,r3,r4,r5 + // Clobbers: r0,r7,r6 + // START: sqr 32 + // Input operand in r6 + // Result in r2 ,r3 + // Clobbers: r4, r5 + uxth r2,r6 + lsr r3,r6,#16 + .syntax unified + mov r4,r2 + .syntax divided + mul r4,r3 + mul r2,r2 + mul r3,r3 + lsr r5,r4,#15 + lsl r4,r4,#17 + add r2,r4 + adc r3,r5 + // End: sqr 32 + // Result in r2 ,r3 + sub r6,r7 + sbc r4,r4 + eor r6,r4 + sub r6,r4 + // START: sqr 32 + // Input operand in r7 + // Result in r4 ,r5 + // Clobbers: r0, r7 + uxth r4,r7 + lsr r5,r7,#16 + .syntax unified + mov r0,r4 + .syntax divided + mul r0,r5 + mul r4,r4 + mul r5,r5 + lsr r7,r0,#15 + lsl r0,r0,#17 + add r4,r0 + adc r5,r7 + // End: sqr 32 + // Result in r4 ,r5 + mov r7,#0 + add r4,r3 + adc r5,r7 + // START: sqr 32 + // Input operand in r6 + // Result in r7 ,r0 + // Clobbers: r6, r3 + uxth r7,r6 + lsr r0,r6,#16 + .syntax unified + mov r6,r7 + .syntax divided + mul r6,r0 + mul r7,r7 + mul r0,r0 + lsr r3,r6,#15 + lsl r6,r6,#17 + add r7,r6 + adc r0,r3 + // End: sqr 32 + // Result in r7 ,r0 + .syntax unified + mov r3,r4 + .syntax divided + sub r3,r7 + sbc r4,r0 + .syntax unified + mov r0,r5 + .syntax divided + mov r6,#0 + sbc r5,r6 + add r3,r2 + adc r4,r0 + adc r5,r6 + // END: sqr 64 Refined Karatsuba + // Result in r2,r3,r4,r5 + // Leaves r6 zero. + .syntax unified + mov r0,r12 + .syntax divided + add r2,r0 + adc r3,r1 + adc r4,r6 + adc r5,r6 + .syntax unified + mov r12,r2 + mov r2,r8 + mov r8,r3 + mov r3,r9 + mov r9,r4 + .syntax divided + // START: sqr 64 Refined Karatsuba + // Input operands in r2,r3 + // Result in r6,r7,r0,r1 + // Clobbers: r2,r3,r4 + // START: sqr 32 + // Input operand in r2 + // Result in r6 ,r7 + // Clobbers: r0, r1 + uxth r6,r2 + lsr r7,r2,#16 + .syntax unified + mov r0,r6 + .syntax divided + mul r0,r7 + mul r6,r6 + mul r7,r7 + lsr r1,r0,#15 + lsl r0,r0,#17 + add r6,r0 + adc r7,r1 + // End: sqr 32 + // Result in r6 ,r7 + sub r2,r3 + sbc r4,r4 + eor r2,r4 + sub r2,r4 + // START: sqr 32 + // Input operand in r3 + // Result in r0 ,r1 + // Clobbers: r3, r4 + uxth r0,r3 + lsr r1,r3,#16 + .syntax unified + mov r3,r0 + .syntax divided + mul r3,r1 + mul r0,r0 + mul r1,r1 + lsr r4,r3,#15 + lsl r3,r3,#17 + add r0,r3 + adc r1,r4 + // End: sqr 32 + // Result in r0 ,r1 + mov r4,#0 + add r0,r7 + adc r1,r4 + // START: sqr 32 + // Input operand in r2 + // Result in r3 ,r4 + // Clobbers: r2, r7 + uxth r3,r2 + lsr r4,r2,#16 + .syntax unified + mov r2,r3 + .syntax divided + mul r2,r4 + mul r3,r3 + mul r4,r4 + lsr r7,r2,#15 + lsl r2,r2,#17 + add r3,r2 + adc r4,r7 + // End: sqr 32 + // Result in r3 ,r4 + .syntax unified + mov r7,r0 + .syntax divided + sub r7,r3 + sbc r0,r4 + .syntax unified + mov r2,r1 + .syntax divided + mov r4,#0 + sbc r1,r4 + add r7,r6 + adc r0,r2 + adc r1,r4 + // END: sqr 64 Refined Karatsuba + // Result in r6,r7,r0,r1 + // Returns r4 as zero. 
+ .syntax unified + mov r2,r12 + mov r3,r8 + mov r4,r9 + .syntax divided + sub r2,r6 + sbc r3,r7 + .syntax unified + mov r6,r4 + mov r7,r5 + .syntax divided + sbc r4,r0 + sbc r5,r1 + mov r0,#0 + sbc r6,r0 + sbc r7,r0 + .syntax unified + mov r0,r10 + .syntax divided + add r2,r0 + .syntax unified + mov r1,r11 + .syntax divided + adc r3,r1 + .syntax unified + mov r0,r12 + .syntax divided + adc r4,r0 + .syntax unified + mov r0,r8 + .syntax divided + adc r5,r0 + mov r0,#0 + adc r6,r0 + adc r7,r0 + .syntax unified + mov r0,r10 + .syntax divided + // END: sqr 128 Refined Karatsuba + // Result in r0 ... r7 + mvn r0,r0 + mvn r1,r1 + mvn r2,r2 + mvn r3,r3 + mvn r4,r4 + mvn r5,r5 + mvn r6,r6 + mvn r7,r7 + .syntax unified + mov r8,r4 + mov r9,r5 + mov r10,r6 + mov r11,r7 + .syntax divided + mov r4,#143 + asr r4,r4,#1 + pop {r4,r5,r6,r7} + adc r0,r4 + adc r1,r5 + adc r2,r6 + adc r3,r7 + .syntax unified + mov r12,r4 + .syntax divided + mov r4,#16 + add r4,r14 + stm r4!,{r0,r1,r2,r3} + .syntax unified + mov r4,r12 + mov r0,r8 + .syntax divided + adc r0,r4 + .syntax unified + mov r8,r0 + mov r1,r9 + .syntax divided + adc r1,r5 + .syntax unified + mov r9,r1 + mov r2,r10 + .syntax divided + adc r2,r6 + .syntax unified + mov r10,r2 + mov r3,r11 + .syntax divided + adc r3,r7 + .syntax unified + mov r11,r3 + .syntax divided + mov r0,#0 + adc r0,r0 + .syntax unified + mov r12,r0 + mov r0,r14 + .syntax divided + ldm r0,{r0,r1,r2,r3,r4,r5,r6,r7} + add r0,r4 + adc r1,r5 + adc r2,r6 + adc r3,r7 + mov r4,#16 + add r4,r14 + stm r4!,{r0,r1,r2,r3} + .syntax unified + mov r14,r4 + mov r0,r13 + .syntax divided + ldm r0!,{r4,r5,r6,r7} + .syntax unified + mov r1,r8 + .syntax divided + adc r4,r1 + .syntax unified + mov r1,r9 + .syntax divided + adc r5,r1 + .syntax unified + mov r1,r10 + .syntax divided + adc r6,r1 + .syntax unified + mov r1,r11 + .syntax divided + adc r7,r1 + .syntax unified + mov r0,r14 + .syntax divided + stm r0!,{r4,r5,r6,r7} + pop {r4,r5,r6,r7} + .syntax unified + mov r1,r12 + .syntax divided + mov r2,#0 + mvn r2,r2 + adc r1,r2 + asr r2,r1,#4 + add r4,r1 + adc r5,r2 + adc r6,r2 + adc r7,r2 + stm r0!,{r4,r5,r6,r7} + pop {r3,r4,r5,r6,r7} + .syntax unified + mov r8,r3 + mov r9,r4 + mov r10,r5 + mov r11,r6 + mov r12,r7 + .syntax divided + pop {r0,r4,r5,r6,r7,r15} +//Cycle Count ASM-Version of 256 sqr (Refined Karatsuba) (Cortex M0): 793 (697 instructions). + .size square256_asm, .-square256_asm -- cgit v1.2.1