diff options
author | Niels Möller <nisse@lysator.liu.se> | 2021-12-09 21:39:09 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2022-01-04 20:25:12 +0100 |
commit | f57640ead79484297635ebbddb8e8952cd42f395 (patch) | |
tree | a8acf430cf62f4019e5426e63b1b2c944ea87be0 /x86_64 | |
parent | dd65a63e7453750506144e5caeb6e159165e1bc2 (diff) | |
download | nettle-f57640ead79484297635ebbddb8e8952cd42f395.tar.gz |
x86_64: Improved ecc_secp256r1_redc
* x86_64/ecc-secp256r1-redc.asm: New folding scheme with one fewer
carry propagation phase, and fewer registers, avoiding save and
restore of callee-saved registers. 17% speedup of this function on
AMD Ryzen 5, resulting in a modest improvement in ECDSA
performance.
Diffstat (limited to 'x86_64')
-rw-r--r-- | x86_64/ecc-secp256r1-redc.asm | 102 |
1 file changed, 45 insertions, 57 deletions
diff --git a/x86_64/ecc-secp256r1-redc.asm b/x86_64/ecc-secp256r1-redc.asm index 91f55225..bffa7726 100644 --- a/x86_64/ecc-secp256r1-redc.asm +++ b/x86_64/ecc-secp256r1-redc.asm @@ -1,7 +1,7 @@ C x86_64/ecc-secp256r1-redc.asm ifelse(` - Copyright (C) 2013 Niels Möller + Copyright (C) 2013, 2021 Niels Möller This file is part of GNU Nettle. @@ -39,69 +39,61 @@ define(`U0', `%rdi') C Overlaps unused modulo input define(`U1', `%rcx') define(`U2', `%rax') define(`U3', `%r8') -define(`U4', `%r9') -define(`U5', `%r10') -define(`U6', `%r11') -define(`F0', `%r12') -define(`F1', `%r13') -define(`F2', `%rbx') -define(`F3', `%rbp') - -C FOLD(x), sets (F3,F2,F1,F0) <-- (x << 160) - (x << 128) - (x<<32) +define(`F0', `%r9') +define(`F1', `%r10') +define(`F2', `%r11') +define(`F3', `%rdx') C Overlap XP, used only in final carry folding + +C FOLD(x), sets (x,F2,F1,F0 ) <-- (x << 192) - (x << 160) + (x << 128) + (x << 32) define(`FOLD', ` + mov $1, F0 + mov $1, F1 + mov $1, F2 + shl `$'32, F0 + shr `$'32, F1 + sub F0, F2 + sbb F1, $1 +') +C FOLDC(x), sets (x,F2,F1,F0) <-- ((x+c) << 192) - (x << 160) + (x << 128) + (x << 32) +define(`FOLDC', ` + mov $1, F0 + mov $1, F1 mov $1, F2 - mov $1, F3 - shl `$'32, F2 - shr `$'32, F3 - xor F0,F0 - xor F1,F1 - sub F2, F0 - sbb F3, F1 - sbb $1, F2 - sbb `$'0, F3 + adc `$'0, $1 C May overflow, but final result will not. 
+ shl `$'32, F0 + shr `$'32, F1 + sub F0, F2 + sbb F1, $1 ') PROLOGUE(_nettle_ecc_secp256r1_redc) W64_ENTRY(3, 0) - C save all registers that need to be saved - push %rbx - push %rbp - push %r12 - push %r13 mov (XP), U0 FOLD(U0) mov 8(XP), U1 mov 16(XP), U2 mov 24(XP), U3 - sub F0, U1 - sbb F1, U2 - sbb F2, U3 - sbb F3, U0 C Add in later - - FOLD(U1) - mov 32(XP), U4 - sub F0, U2 - sbb F1, U3 - sbb F2, U4 - sbb F3, U1 - - FOLD(U2) - mov 40(XP), U5 - sub F0, U3 - sbb F1, U4 - sbb F2, U5 - sbb F3, U2 - - FOLD(U3) - mov 48(XP), U6 - sub F0, U4 - sbb F1, U5 - sbb F2, U6 - sbb F3, U3 - - add U4, U0 - adc U5, U1 - adc U6, U2 + add F0, U1 + adc F1, U2 + adc F2, U3 + adc 32(XP), U0 + + FOLDC(U1) + add F0, U2 + adc F1, U3 + adc F2, U0 + adc 40(XP), U1 + + FOLDC(U2) + add F0, U3 + adc F1, U0 + adc F2, U1 + adc 48(XP), U2 + + FOLDC(U3) + add F0, U0 + adc F1, U1 + adc F2, U2 adc 56(XP), U3 C Sum, including carry, is < 2^{256} + p. @@ -126,10 +118,6 @@ PROLOGUE(_nettle_ecc_secp256r1_redc) mov U3, 24(RP) - pop %r13 - pop %r12 - pop %rbp - pop %rbx W64_EXIT(3, 0) ret EPILOGUE(_nettle_ecc_secp256r1_redc) |