author     Niels Möller <nisse@lysator.liu.se>  2020-10-30 20:40:36 +0100
committer  Niels Möller <nisse@lysator.liu.se>  2020-10-30 20:40:36 +0100
commit     82dc13c952067e1d6d0af818bf5cc76f558da73b (patch)
tree       a69209d3c2e31e6c43e2fc98805b163f9291eff9
parent     f4f5625edb7d899972431b838ac19ced9288f68a (diff)
Update x86_64 assembly mod functions
-rw-r--r--  x86_64/ecc-curve25519-modp.asm  10
-rw-r--r--  x86_64/ecc-curve448-modp.asm    59
-rw-r--r--  x86_64/ecc-secp192r1-modp.asm   20
-rw-r--r--  x86_64/ecc-secp224r1-modp.asm   32
-rw-r--r--  x86_64/ecc-secp384r1-modp.asm   56
-rw-r--r--  x86_64/ecc-secp521r1-modp.asm   62
6 files changed, 131 insertions(+), 108 deletions(-)
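
All six routines get the same calling-convention update: the output pointer rp stays in %rsi, a new source pointer xp arrives in %rdx (the XP define), and the W64_ENTRY/W64_EXIT counts go from 2 to 3 arguments. A rough C view, sketched from the prototype comment this patch adds to ecc-secp384r1-modp.asm (not copied verbatim from Nettle's headers):

    #include <gmp.h>            /* mp_limb_t */
    struct ecc_modulo;          /* defined in Nettle's ecc-internal.h */

    /* Old: reduce in place, reading and writing through rp.
       void _nettle_ecc_secp384r1_modp (const struct ecc_modulo *m,
                                        mp_limb_t *rp); */

    /* New: read the double-length input from xp, store the reduced
       result at rp. */
    void
    _nettle_ecc_secp384r1_modp (const struct ecc_modulo *m,
                                mp_limb_t *rp, mp_limb_t *xp);

Because %rdx doubles as mul's implicit output and as a scratch register in several of these files, each routine either renames its temporaries off %rdx or saves rp on the stack until the final stores.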
diff --git a/x86_64/ecc-curve25519-modp.asm b/x86_64/ecc-curve25519-modp.asm
index 713fbf3b..3e48e9ac 100644
--- a/x86_64/ecc-curve25519-modp.asm
+++ b/x86_64/ecc-curve25519-modp.asm
@@ -33,6 +33,7 @@ ifelse(`
.file "ecc-25519-modp.asm"
define(`RP', `%rsi')
+define(`XP', `%rdx') C Overlaps with mul register
define(`U0', `%rdi') C Overlaps unused modulo input
define(`U1', `%rcx')
define(`U2', `%r8')
@@ -42,8 +43,10 @@ define(`T1', `%r11')
define(`M', `%rbx')
PROLOGUE(_nettle_ecc_curve25519_modp)
- W64_ENTRY(2, 0)
+ W64_ENTRY(3, 0)
push %rbx
+ push RP
+ mov XP, RP
C First fold the limbs affecting bit 255
mov 56(RP), %rax
@@ -79,6 +82,9 @@ PROLOGUE(_nettle_ecc_curve25519_modp)
mov 48(RP), %rax
mov %rdx, T1
mul M
+
+ pop RP
+
add T0, U0
mov U0, (RP)
adc T1, U1
@@ -89,6 +95,6 @@ PROLOGUE(_nettle_ecc_curve25519_modp)
mov U3, 24(RP)
pop %rbx
- W64_EXIT(2, 0)
+ W64_EXIT(3, 0)
ret
EPILOGUE(_nettle_ecc_curve25519_modp)
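
For reference: here p = 2^255 - 19, so the limbs above bit 255 fold back multiplied by a small constant (2^256 ≡ 38 mod p, presumably the value in M; its load is outside these hunks). Since XP arrives in %rdx, which mul overwrites, the patch parks the output pointer with push RP, reads through %rsi instead (mov XP, RP), and pops it back just before the stores.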
diff --git a/x86_64/ecc-curve448-modp.asm b/x86_64/ecc-curve448-modp.asm
index 52ec4b7d..44c3bf3e 100644
--- a/x86_64/ecc-curve448-modp.asm
+++ b/x86_64/ecc-curve448-modp.asm
@@ -33,37 +33,39 @@ ifelse(`
.file "ecc-curve448-modp.asm"
define(`RP', `%rsi')
+define(`XP', `%rdx')
define(`X0', `%rax')
define(`X1', `%rbx')
define(`X2', `%rcx')
-define(`X3', `%rdx')
-define(`X4', `%rbp')
-define(`X5', `%rdi')
-define(`X6', `%r8')
-define(`X7', `%r9')
-define(`T0', `%r10')
-define(`T1', `%r11')
-define(`T2', `%r12')
+define(`X3', `%rbp')
+define(`X4', `%rdi')
+define(`X5', `%r8')
+define(`X6', `%r9')
+define(`X7', `%r10')
+define(`T0', `%r11')
+define(`T1', `%r12')
+define(`T2', `%r13')
PROLOGUE(_nettle_ecc_curve448_modp)
- W64_ENTRY(2, 0)
+ W64_ENTRY(3, 0)
push %rbx
push %rbp
push %r12
+ push %r13
C First load the values to be shifted by 32.
- mov 88(RP), X1
+ mov 88(XP), X1
mov X1, X0
- mov 96(RP), X2
+ mov 96(XP), X2
mov X1, T0
- mov 104(RP), X3
+ mov 104(XP), X3
mov X2, T1
- mov 56(RP), X4
+ mov 56(XP), X4
mov X3, T2
- mov 64(RP), X5
- mov 72(RP), X6
- mov 80(RP), X7
+ mov 64(XP), X5
+ mov 72(XP), X6
+ mov 80(XP), X7
C Multiply by 2^32
shl $32, X0
@@ -82,22 +84,22 @@ PROLOGUE(_nettle_ecc_curve448_modp)
adc $0, X7
C Main additions
- add 56(RP), X0
- adc 64(RP), X1
- adc 72(RP), X2
- adc 80(RP), X3
+ add 56(XP), X0
+ adc 64(XP), X1
+ adc 72(XP), X2
+ adc 80(XP), X3
adc T0, X4
adc T1, X5
adc T2, X6
adc $0, X7
- add (RP), X0
- adc 8(RP), X1
- adc 16(RP), X2
- adc 24(RP), X3
- adc 32(RP), X4
- adc 40(RP), X5
- adc 48(RP), X6
+ add (XP), X0
+ adc 8(XP), X1
+ adc 16(XP), X2
+ adc 24(XP), X3
+ adc 32(XP), X4
+ adc 40(XP), X5
+ adc 48(XP), X6
adc $0, X7
C X7 wraparound
@@ -135,10 +137,11 @@ PROLOGUE(_nettle_ecc_curve448_modp)
adc $0, X6
mov X6, 48(RP)
+ pop %r13
pop %r12
pop %rbp
pop %rbx
- W64_EXIT(2, 0)
+ W64_EXIT(3, 0)
ret
EPILOGUE(_nettle_ecc_curve448_modp)
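
For reference: curve448's p = 2^448 - 2^224 - 1, so 2^448 ≡ 2^224 + 1 (mod p). With B = 2^64 that is B^7 ≡ 2^32 B^3 + 1, which is why the fold adds the high limbs in directly and also adds copies shifted left 32 bits, three limbs higher (the "Multiply by 2^32" block), with a final "X7 wraparound" pass for the carry out.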
diff --git a/x86_64/ecc-secp192r1-modp.asm b/x86_64/ecc-secp192r1-modp.asm
index 8cdab01d..fa093609 100644
--- a/x86_64/ecc-secp192r1-modp.asm
+++ b/x86_64/ecc-secp192r1-modp.asm
@@ -33,9 +33,11 @@ ifelse(`
.file "ecc-secp192r1-modp.asm"
define(`RP', `%rsi')
+define(`XP', `%rdx')
+
define(`T0', `%rdi') C Overlaps unused modulo input
define(`T1', `%rcx')
-define(`T2', `%rdx')
+define(`T2', `%rax')
define(`T3', `%r8')
define(`H', `%r9')
define(`C1', `%r10')
@@ -45,10 +47,10 @@ define(`C2', `%r11')
.text
ALIGN(16)
PROLOGUE(_nettle_ecc_secp192r1_modp)
- W64_ENTRY(2, 0)
- mov 16(RP), T2
- mov 24(RP), T3
- mov 40(RP), H
+ W64_ENTRY(3, 0)
+ mov 16(XP), T2
+ mov 24(XP), T3
+ mov 40(XP), H
xor C1, C1
xor C2, C2
@@ -57,14 +59,14 @@ PROLOGUE(_nettle_ecc_secp192r1_modp)
C Carry to be added in at T1 and T2
setc LREG(C2)
- mov 8(RP), T1
- mov 32(RP), H
+ mov 8(XP), T1
+ mov 32(XP), H
adc H, T1
adc H, T2
C Carry to be added in at T0 and T1
setc LREG(C1)
- mov (RP), T0
+ mov (XP), T0
adc T3, T0
adc T3, T1
adc $0, C2
@@ -83,6 +85,6 @@ PROLOGUE(_nettle_ecc_secp192r1_modp)
mov T1, 8(RP)
mov T2, 16(RP)
- W64_EXIT(2, 0)
+ W64_EXIT(3, 0)
ret
EPILOGUE(_nettle_ecc_secp192r1_modp)
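
For reference: p = 2^192 - 2^64 - 1, so with B = 2^64 we have B^3 ≡ B + 1 (mod p). Each limb of the high half therefore folds in at two adjacent limb positions, which is exactly the pattern of paired adc H instructions above, with the stray carries collected in C1 and C2.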
diff --git a/x86_64/ecc-secp224r1-modp.asm b/x86_64/ecc-secp224r1-modp.asm
index 34ab52f4..4a667a87 100644
--- a/x86_64/ecc-secp224r1-modp.asm
+++ b/x86_64/ecc-secp224r1-modp.asm
@@ -35,20 +35,23 @@ ifelse(`
GMP_NUMB_BITS(64)
define(`RP', `%rsi')
-define(`T0', `%rdi') C Overlaps unused modulo input
+define(`XP', `%rdx')
+define(`T0', `%rdi') C Overlaps unused modulo input
define(`T1', `%rcx')
define(`H0', `%rax')
-define(`H1', `%rdx')
-define(`H2', `%r8')
-define(`F0', `%r9')
+define(`H1', `%r8')
+define(`H2', `%r9')
+define(`F0', `%rsi') C Overlaps RP
define(`F1', `%r10')
define(`F2', `%r11')
C ecc_secp224r1_modp (const struct ecc_modulo *m, mp_limb_t *rp)
PROLOGUE(_nettle_ecc_secp224r1_modp)
- W64_ENTRY(2, 0)
- mov 48(RP), H0
- mov 56(RP), H1
+ W64_ENTRY(3, 0)
+ push RP
+
+ mov 48(XP), H0
+ mov 56(XP), H1
C Set (F2,F1,F0) <-- (H1,H0) << 32
mov H0, F0
mov H0, F1
@@ -61,15 +64,15 @@ PROLOGUE(_nettle_ecc_secp224r1_modp)
or T0, F1
xor H2, H2
- mov 16(RP), T0
- mov 24(RP), T1
+ mov 16(XP), T0
+ mov 24(XP), T1
sub F0, T0
sbb F1, T1
sbb F2, H0
sbb $0, H1 C No further borrow
- adc 32(RP), H0
- adc 40(RP), H1
+ adc 32(XP), H0
+ adc 40(XP), H1
adc $0, H2
C Set (F2,F1,F0) <-- (H2,H1,H0) << 32
@@ -92,8 +95,8 @@ PROLOGUE(_nettle_ecc_secp224r1_modp)
or T0, F1
or T1, F2
- mov (RP), T0
- mov 8(RP), T1
+ mov (XP), T0
+ mov 8(XP), T1
sub F0, T0
sbb F1, T1
sbb F2, H0
@@ -121,11 +124,12 @@ PROLOGUE(_nettle_ecc_secp224r1_modp)
adc F2, H0
adc $0, H1
+ pop RP
mov T0, (RP)
mov T1, 8(RP)
mov H0, 16(RP)
mov H1, 24(RP)
- W64_EXIT(2, 0)
+ W64_EXIT(3, 0)
ret
EPILOGUE(_nettle_ecc_secp224r1_modp)
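
For reference: p = 2^224 - 2^96 + 1, so 2^224 ≡ 2^96 - 1 (mod p). A 96-bit shift is one limb plus 32 bits, hence the repeated "(F2,F1,F0) <-- (H1,H0) << 32" blocks that build the 2^96-aligned term out of shl/shr/or pairs, while the -1 is accounted for by mixing sbb and adc chains.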
diff --git a/x86_64/ecc-secp384r1-modp.asm b/x86_64/ecc-secp384r1-modp.asm
index 24b3d92d..fbc3a2fd 100644
--- a/x86_64/ecc-secp384r1-modp.asm
+++ b/x86_64/ecc-secp384r1-modp.asm
@@ -32,13 +32,17 @@ ifelse(`
.file "ecc-secp384r1-modp.asm"
+C Input arguments:
+C %rdi (unused)
define(`RP', `%rsi')
+define(`XP', `%rdx')
+
define(`D5', `%rax')
define(`T0', `%rbx')
define(`T1', `%rcx')
-define(`T2', `%rdx')
+define(`T2', `%rdi')
define(`T3', `%rbp')
-define(`T4', `%rdi')
+define(`T4', `%rsi')
define(`T5', `%r8')
define(`H0', `%r9')
define(`H1', `%r10')
@@ -48,11 +52,12 @@ define(`H4', `%r13')
define(`H5', `%r14')
define(`C2', `%r15')
define(`C0', H5) C Overlap
-define(`TMP', RP) C Overlap
+define(`TMP', XP) C Overlap
+ C void ecc_secp384r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp)
PROLOGUE(_nettle_ecc_secp384r1_modp)
- W64_ENTRY(2, 0)
+ W64_ENTRY(3, 0)
push %rbx
push %rbp
@@ -61,6 +66,7 @@ PROLOGUE(_nettle_ecc_secp384r1_modp)
push %r14
push %r15
+ push RP C Output pointer
C First get top 2 limbs, which need folding twice.
C B^10 = B^6 + B^4 + 2^32 (B-1)B^4.
C We handle the terms as follow:
@@ -74,8 +80,8 @@ PROLOGUE(_nettle_ecc_secp384r1_modp)
C in 2.5 limbs. The low limb saved in D5, high 1.5 limbs added
C in.
- mov 80(RP), H4
- mov 88(RP), H5
+ mov 80(XP), H4
+ mov 88(XP), H5
C Shift right 32 bits, into H1, H0
mov H4, H0
mov H5, H1
@@ -100,30 +106,28 @@ PROLOGUE(_nettle_ecc_secp384r1_modp)
adc $0, C2
C Add in to high part
- add 48(RP), H0
- adc 56(RP), H1
+ add 48(XP), H0
+ adc 56(XP), H1
adc $0, C2 C Do C2 later
C +1 term
- mov (RP), T0
+ mov (XP), T0
add H0, T0
- mov 8(RP), T1
+ mov 8(XP), T1
adc H1, T1
- mov 16(RP), T2
- mov 64(RP), H2
+ mov 16(XP), T2
+ mov 64(XP), H2
adc H2, T2
- mov 24(RP), T3
- mov 72(RP), H3
+ mov 24(XP), T3
+ mov 72(XP), H3
adc H3, T3
- mov 32(RP), T4
+ mov 32(XP), T4
adc H4, T4
- mov 40(RP), T5
+ mov 40(XP), T5
adc H5, T5
sbb C0, C0
neg C0 C FIXME: Switch sign of C0?
- push RP
-
C +B^2 term
add H0, T2
adc H1, T3
@@ -207,20 +211,20 @@ PROLOGUE(_nettle_ecc_secp384r1_modp)
sub H1, H0
sbb $0, H1
- pop RP
+ pop XP C Original RP argument
add H0, T0
- mov T0, (RP)
+ mov T0, (XP)
adc H1, T1
- mov T1, 8(RP)
+ mov T1, 8(XP)
adc C0, T2
- mov T2, 16(RP)
+ mov T2, 16(XP)
adc $0, T3
- mov T3, 24(RP)
+ mov T3, 24(XP)
adc $0, T4
- mov T4, 32(RP)
+ mov T4, 32(XP)
adc $0, T5
- mov T5, 40(RP)
+ mov T5, 40(XP)
pop %r15
pop %r14
@@ -229,6 +233,6 @@ PROLOGUE(_nettle_ecc_secp384r1_modp)
pop %rbp
pop %rbx
- W64_EXIT(2, 0)
+ W64_EXIT(3, 0)
ret
EPILOGUE(_nettle_ecc_secp384r1_modp)
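
For reference, the identity quoted in this file's comment follows directly from the prime: p = 2^384 - 2^128 - 2^96 + 2^32 - 1, so with B = 2^64, B^6 ≡ B^2 + 2^32 (B-1) + 1 (mod p); multiplying through by B^4 gives B^10 ≡ B^6 + B^4 + 2^32 (B-1) B^4, which is why the top two limbs need folding twice. To free %rdx for XP, the patch moves T2 and T4 onto %rdi and %rsi and pushes the output pointer at entry instead of mid-function.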
diff --git a/x86_64/ecc-secp521r1-modp.asm b/x86_64/ecc-secp521r1-modp.asm
index 16727893..00955fb5 100644
--- a/x86_64/ecc-secp521r1-modp.asm
+++ b/x86_64/ecc-secp521r1-modp.asm
@@ -35,89 +35,92 @@ ifelse(`
GMP_NUMB_BITS(64)
define(`RP', `%rsi')
+define(`XP', `%rdx')
+
define(`U0', `%rax')
define(`U1', `%rbx')
define(`U2', `%rcx')
-define(`U3', `%rdx')
-define(`U4', `%rbp')
-define(`U5', `%rdi')
-define(`U6', `%r8')
-define(`U7', `%r9')
-define(`U8', `%r10')
-define(`U9', `%r11')
-define(`T0', `%r12')
-define(`T1', `%r13')
+define(`U3', `%rbp')
+define(`U4', `%rdi')
+define(`U5', `%r8')
+define(`U6', `%r9')
+define(`U7', `%r10')
+define(`U8', `%r11')
+define(`U9', `%r12')
+define(`T0', `%r13')
+define(`T1', `%r14')
PROLOGUE(_nettle_ecc_secp521r1_modp)
- W64_ENTRY(2, 0)
+ W64_ENTRY(3, 0)
push %rbx
push %rbp
push %r12
push %r13
+ push %r14
C Read top 17 limbs, shift left 55 bits
- mov 72(RP), U1
+ mov 72(XP), U1
mov U1, U0
shl $55, U0
shr $9, U1
- mov 80(RP), U2
+ mov 80(XP), U2
mov U2, T0
shr $9, U2
shl $55, T0
or T0, U1
- mov 88(RP), U3
+ mov 88(XP), U3
mov U3, T0
shr $9, U3
shl $55, T0
or T0, U2
- mov 96(RP), U4
+ mov 96(XP), U4
mov U4, T0
shr $9, U4
shl $55, T0
or T0, U3
- mov 104(RP), U5
+ mov 104(XP), U5
mov U5, T0
shr $9, U5
shl $55, T0
or T0, U4
- mov 112(RP), U6
+ mov 112(XP), U6
mov U6, T0
shr $9, U6
shl $55, T0
or T0, U5
- mov 120(RP), U7
+ mov 120(XP), U7
mov U7, T0
shr $9, U7
shl $55, T0
or T0, U6
- mov 128(RP), U8
+ mov 128(XP), U8
mov U8, T0
shr $9, U8
shl $55, T0
or T0, U7
- mov 136(RP), U9
+ mov 136(XP), U9
mov U9, T0
shr $9, U9
shl $55, T0
or T0, U8
- add (RP), U0
- adc 8(RP), U1
- adc 16(RP), U2
- adc 24(RP), U3
- adc 32(RP), U4
- adc 40(RP), U5
- adc 48(RP), U6
- adc 56(RP), U7
- adc 64(RP), U8
+ add (XP), U0
+ adc 8(XP), U1
+ adc 16(XP), U2
+ adc 24(XP), U3
+ adc 32(XP), U4
+ adc 40(XP), U5
+ adc 48(XP), U6
+ adc 56(XP), U7
+ adc 64(XP), U8
adc $0, U9
C Top limbs are <U9, U8>. Keep low 9 bits of 8, and fold the
@@ -149,10 +152,11 @@ PROLOGUE(_nettle_ecc_secp521r1_modp)
adc $0, U8
mov U8, 64(RP)
+ pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
- W64_EXIT(2, 0)
+ W64_EXIT(3, 0)
ret
EPILOGUE(_nettle_ecc_secp521r1_modp)
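
For reference: p = 2^521 - 1, so 2^521 ≡ 1 and in particular 2^576 ≡ 2^55 (mod p). The nine limbs at offsets 72..136 sit at bit 576 and above, so they fold back shifted left 55 bits (the shl $55 / shr $9 pairs) before the long adc chain; whatever then remains above bit 521, everything but the low 9 bits of U8, is folded once more in the final steps, per the "<U9, U8>" comment.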