author    Niels Möller <nisse@lysator.liu.se>    2021-12-09 21:39:09 +0100
committer Niels Möller <nisse@lysator.liu.se>    2022-01-04 20:25:12 +0100
commit    f57640ead79484297635ebbddb8e8952cd42f395 (patch)
tree      a8acf430cf62f4019e5426e63b1b2c944ea87be0 /x86_64
parent    dd65a63e7453750506144e5caeb6e159165e1bc2 (diff)
download  nettle-f57640ead79484297635ebbddb8e8952cd42f395.tar.gz
x86_64: Improved ecc_secp256r1_redc
* x86_64/ecc-secp256r1-redc.asm: New folding scheme with one less carry propagation phase, and fewer registers, avoiding save and restore of callee-save registers. 17% speedup of this function on AMD Ryzen 5, resulting in a modest improvement in ecdsa performance.
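
As a reading aid (not part of the commit message): the folding works because the secp256r1 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1 is congruent to -1 (mod 2^64), so the Montgomery quotient for the lowest limb u0 is u0 itself, and one reduction step simply adds u0 * (p+1)/2^64 = (u0 << 192) - (u0 << 160) + (u0 << 128) + (u0 << 32) to the remaining limbs. That is exactly the quantity the new FOLD macro produces. A small Python sketch (illustrative names, not from the patch) checking this step:

# Sketch: one folding step divides by 2^64 modulo the secp256r1 prime.
import random

p = 2**256 - 2**224 + 2**192 + 2**96 - 1
K = (p + 1) >> 64            # == 2^192 - 2^160 + 2^128 + 2^32, the FOLD constant

def fold_step(x):
    # Adding u0*p clears the low limb (p == -1 mod 2^64); dropping that limb
    # gives (x + u0*p) >> 64 == (x >> 64) + u0*K.
    u0 = x & (2**64 - 1)
    return (x >> 64) + u0 * K

for _ in range(1000):
    x = random.randrange(2**512)
    # fold_step divides by 2^64 mod p (pow with -1 needs Python 3.8+).
    assert fold_step(x) % p == x * pow(2**64, -1, p) % p
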
Diffstat (limited to 'x86_64')
-rw-r--r--  x86_64/ecc-secp256r1-redc.asm  102
1 file changed, 45 insertions, 57 deletions
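
As a further reading aid (not from the patch), the FOLD/FOLDC bodies in the diff below can be checked limb by limb. The Python sketch here emulates the mov/adc/shl/shr/sub/sbb sequence on 64-bit registers and compares the result against the value stated in the new comments, (x << 192) - (x << 160) + (x << 128) + (x << 32), plus the carry-in at bit 192 for FOLDC. Function and variable names are illustrative only:

# Sketch: emulate the FOLD/FOLDC register sequence on 64-bit limbs.
import random

M = 2**64 - 1                            # 64-bit register mask
K = 2**192 - 2**160 + 2**128 + 2**32     # (p+1)/2^64 for the secp256r1 prime

def fold_regs(x, c=0):
    # FOLD when c == 0, FOLDC when c is the incoming carry bit.
    f0, f1, f2 = x, x, x                 # mov $1, F0/F1/F2
    x = (x + c) & M                      # adc $0, $1 (may wrap; see comment in patch)
    f0 = (f0 << 32) & M                  # shl $32, F0
    f1 >>= 32                            # shr $32, F1
    borrow = f2 < f0
    f2 = (f2 - f0) & M                   # sub F0, F2
    x = (x - f1 - borrow) & M            # sbb F1, $1
    return f0, f1, f2, x                 # least significant limb first

for _ in range(1000):
    u, c = random.randrange(2**64), random.randrange(2)
    f0, f1, f2, top = fold_regs(u, c)
    assert f0 | (f1 << 64) | (f2 << 128) | (top << 192) == u * K + (c << 192)
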
diff --git a/x86_64/ecc-secp256r1-redc.asm b/x86_64/ecc-secp256r1-redc.asm
index 91f55225..bffa7726 100644
--- a/x86_64/ecc-secp256r1-redc.asm
+++ b/x86_64/ecc-secp256r1-redc.asm
@@ -1,7 +1,7 @@
C x86_64/ecc-secp256r1-redc.asm
ifelse(`
- Copyright (C) 2013 Niels Möller
+ Copyright (C) 2013, 2021 Niels Möller
This file is part of GNU Nettle.
@@ -39,69 +39,61 @@ define(`U0', `%rdi') C Overlaps unused modulo input
define(`U1', `%rcx')
define(`U2', `%rax')
define(`U3', `%r8')
-define(`U4', `%r9')
-define(`U5', `%r10')
-define(`U6', `%r11')
-define(`F0', `%r12')
-define(`F1', `%r13')
-define(`F2', `%rbx')
-define(`F3', `%rbp')
-
-C FOLD(x), sets (F3,F2,F1,F0) <-- (x << 160) - (x << 128) - (x<<32)
+define(`F0', `%r9')
+define(`F1', `%r10')
+define(`F2', `%r11')
+define(`F3', `%rdx') C Overlaps XP, used only in final carry folding
+
+C FOLD(x), sets (x,F2,F1,F0) <-- (x << 192) - (x << 160) + (x << 128) + (x << 32)
define(`FOLD', `
+ mov $1, F0
+ mov $1, F1
+ mov $1, F2
+ shl `$'32, F0
+ shr `$'32, F1
+ sub F0, F2
+ sbb F1, $1
+')
+C FOLDC(x), sets (x,F2,F1,F0) <-- ((x+c) << 192) - (x << 160) + (x << 128) + (x << 32)
+define(`FOLDC', `
+ mov $1, F0
+ mov $1, F1
mov $1, F2
- mov $1, F3
- shl `$'32, F2
- shr `$'32, F3
- xor F0,F0
- xor F1,F1
- sub F2, F0
- sbb F3, F1
- sbb $1, F2
- sbb `$'0, F3
+ adc `$'0, $1 C May overflow, but final result will not.
+ shl `$'32, F0
+ shr `$'32, F1
+ sub F0, F2
+ sbb F1, $1
')
PROLOGUE(_nettle_ecc_secp256r1_redc)
W64_ENTRY(3, 0)
- C save all registers that need to be saved
- push %rbx
- push %rbp
- push %r12
- push %r13
mov (XP), U0
FOLD(U0)
mov 8(XP), U1
mov 16(XP), U2
mov 24(XP), U3
- sub F0, U1
- sbb F1, U2
- sbb F2, U3
- sbb F3, U0 C Add in later
-
- FOLD(U1)
- mov 32(XP), U4
- sub F0, U2
- sbb F1, U3
- sbb F2, U4
- sbb F3, U1
-
- FOLD(U2)
- mov 40(XP), U5
- sub F0, U3
- sbb F1, U4
- sbb F2, U5
- sbb F3, U2
-
- FOLD(U3)
- mov 48(XP), U6
- sub F0, U4
- sbb F1, U5
- sbb F2, U6
- sbb F3, U3
-
- add U4, U0
- adc U5, U1
- adc U6, U2
+ add F0, U1
+ adc F1, U2
+ adc F2, U3
+ adc 32(XP), U0
+
+ FOLDC(U1)
+ add F0, U2
+ adc F1, U3
+ adc F2, U0
+ adc 40(XP), U1
+
+ FOLDC(U2)
+ add F0, U3
+ adc F1, U0
+ adc F2, U1
+ adc 48(XP), U2
+
+ FOLDC(U3)
+ add F0, U0
+ adc F1, U1
+ adc F2, U2
adc 56(XP), U3
C Sum, including carry, is < 2^{256} + p.
@@ -126,10 +118,6 @@ PROLOGUE(_nettle_ecc_secp256r1_redc)
mov U3, 24(RP)
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
W64_EXIT(3, 0)
ret
EPILOGUE(_nettle_ecc_secp256r1_redc)
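
As a final reading aid (not from the patch): the new flow is one FOLD of the lowest limb followed by three FOLDC steps, each of which also folds in the carry out of the previous addition chain, and then the final carry folding (unchanged by this patch), which on carry adds 2^256 mod p = 2^256 - p back into the low 256 bits. Assuming the usual convention that nettle's redc routines return a value below 2^256 congruent to x * 2^-256 mod p (not necessarily fully reduced), a big-integer model of the whole function looks like this (illustrative only):

# Sketch: big-integer model of the four folding steps plus final carry fold.
import random

p = 2**256 - 2**224 + 2**192 + 2**96 - 1
K = (p + 1) >> 64                       # the FOLD constant

def redc_model(x):                      # x: 512-bit input, as stored at XP
    for _ in range(4):                  # FOLD, then three FOLDC steps
        u0 = x & (2**64 - 1)
        x = (x >> 64) + u0 * K          # fold lowest limb into the rest
    assert x < 2**256 + p               # "Sum, including carry, is < 2^{256} + p"
    if x >> 256:                        # carry set: add 2^256 - p and drop the carry,
        x -= p                          # i.e. subtract p
    return x                            # < 2^256 and == input * 2^-256 mod p

for _ in range(1000):
    x = random.randrange(2**512)
    r = redc_model(x)
    assert r < 2**256 and r % p == x * pow(2**64, -4, p) % p
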