diff options
author | Niels Möller <nisse@lysator.liu.se> | 2020-10-30 20:40:36 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2020-10-30 20:40:36 +0100 |
commit | 82dc13c952067e1d6d0af818bf5cc76f558da73b (patch) | |
tree | a69209d3c2e31e6c43e2fc98805b163f9291eff9 | |
parent | f4f5625edb7d899972431b838ac19ced9288f68a (diff) | |
download | nettle-82dc13c952067e1d6d0af818bf5cc76f558da73b.tar.gz |
Update x86_64 assembly mod functions
-rw-r--r-- | x86_64/ecc-curve25519-modp.asm | 10 | ||||
-rw-r--r-- | x86_64/ecc-curve448-modp.asm | 59 | ||||
-rw-r--r-- | x86_64/ecc-secp192r1-modp.asm | 20 | ||||
-rw-r--r-- | x86_64/ecc-secp224r1-modp.asm | 32 | ||||
-rw-r--r-- | x86_64/ecc-secp384r1-modp.asm | 56 | ||||
-rw-r--r-- | x86_64/ecc-secp521r1-modp.asm | 62 |
6 files changed, 131 insertions, 108 deletions
diff --git a/x86_64/ecc-curve25519-modp.asm b/x86_64/ecc-curve25519-modp.asm index 713fbf3b..3e48e9ac 100644 --- a/x86_64/ecc-curve25519-modp.asm +++ b/x86_64/ecc-curve25519-modp.asm @@ -33,6 +33,7 @@ ifelse(` .file "ecc-25519-modp.asm" define(`RP', `%rsi') +define(`XP', `%rdx') C Overlaps with mul register define(`U0', `%rdi') C Overlaps unused modulo input define(`U1', `%rcx') define(`U2', `%r8') @@ -42,8 +43,10 @@ define(`T1', `%r11') define(`M', `%rbx') PROLOGUE(_nettle_ecc_curve25519_modp) - W64_ENTRY(2, 0) + W64_ENTRY(3, 0) push %rbx + push RP + mov XP, RP C First fold the limbs affecting bit 255 mov 56(RP), %rax @@ -79,6 +82,9 @@ PROLOGUE(_nettle_ecc_curve25519_modp) mov 48(RP), %rax mov %rdx, T1 mul M + + pop RP + add T0, U0 mov U0, (RP) adc T1, U1 @@ -89,6 +95,6 @@ PROLOGUE(_nettle_ecc_curve25519_modp) mov U3, 24(RP) pop %rbx - W64_EXIT(2, 0) + W64_EXIT(3, 0) ret EPILOGUE(_nettle_ecc_curve25519_modp) diff --git a/x86_64/ecc-curve448-modp.asm b/x86_64/ecc-curve448-modp.asm index 52ec4b7d..44c3bf3e 100644 --- a/x86_64/ecc-curve448-modp.asm +++ b/x86_64/ecc-curve448-modp.asm @@ -33,37 +33,39 @@ ifelse(` .file "ecc-curve448-modp.asm" define(`RP', `%rsi') +define(`XP', `%rdx') define(`X0', `%rax') define(`X1', `%rbx') define(`X2', `%rcx') -define(`X3', `%rdx') -define(`X4', `%rbp') -define(`X5', `%rdi') -define(`X6', `%r8') -define(`X7', `%r9') -define(`T0', `%r10') -define(`T1', `%r11') -define(`T2', `%r12') +define(`X3', `%rbp') +define(`X4', `%rdi') +define(`X5', `%r8') +define(`X6', `%r9') +define(`X7', `%r10') +define(`T0', `%r11') +define(`T1', `%r12') +define(`T2', `%r13') PROLOGUE(_nettle_ecc_curve448_modp) - W64_ENTRY(2, 0) + W64_ENTRY(3, 0) push %rbx push %rbp push %r12 + push %r13 C First load the values to be shifted by 32. - mov 88(RP), X1 + mov 88(XP), X1 mov X1, X0 - mov 96(RP), X2 + mov 96(XP), X2 mov X1, T0 - mov 104(RP), X3 + mov 104(XP), X3 mov X2, T1 - mov 56(RP), X4 + mov 56(XP), X4 mov X3, T2 - mov 64(RP), X5 - mov 72(RP), X6 - mov 80(RP), X7 + mov 64(XP), X5 + mov 72(XP), X6 + mov 80(XP), X7 C Multiply by 2^32 shl $32, X0 @@ -82,22 +84,22 @@ PROLOGUE(_nettle_ecc_curve448_modp) adc $0, X7 C Main additions - add 56(RP), X0 - adc 64(RP), X1 - adc 72(RP), X2 - adc 80(RP), X3 + add 56(XP), X0 + adc 64(XP), X1 + adc 72(XP), X2 + adc 80(XP), X3 adc T0, X4 adc T1, X5 adc T2, X6 adc $0, X7 - add (RP), X0 - adc 8(RP), X1 - adc 16(RP), X2 - adc 24(RP), X3 - adc 32(RP), X4 - adc 40(RP), X5 - adc 48(RP), X6 + add (XP), X0 + adc 8(XP), X1 + adc 16(XP), X2 + adc 24(XP), X3 + adc 32(XP), X4 + adc 40(XP), X5 + adc 48(XP), X6 adc $0, X7 C X7 wraparound @@ -135,10 +137,11 @@ PROLOGUE(_nettle_ecc_curve448_modp) adc $0, X6 mov X6, 48(RP) + pop %r13 pop %r12 pop %rbp pop %rbx - W64_EXIT(2, 0) + W64_EXIT(3, 0) ret EPILOGUE(_nettle_ecc_curve448_modp) diff --git a/x86_64/ecc-secp192r1-modp.asm b/x86_64/ecc-secp192r1-modp.asm index 8cdab01d..fa093609 100644 --- a/x86_64/ecc-secp192r1-modp.asm +++ b/x86_64/ecc-secp192r1-modp.asm @@ -33,9 +33,11 @@ ifelse(` .file "ecc-secp192r1-modp.asm" define(`RP', `%rsi') +define(`XP', `%rdx') + define(`T0', `%rdi') C Overlaps unused modulo input define(`T1', `%rcx') -define(`T2', `%rdx') +define(`T2', `%rax') define(`T3', `%r8') define(`H', `%r9') define(`C1', `%r10') @@ -45,10 +47,10 @@ define(`C2', `%r11') .text ALIGN(16) PROLOGUE(_nettle_ecc_secp192r1_modp) - W64_ENTRY(2, 0) - mov 16(RP), T2 - mov 24(RP), T3 - mov 40(RP), H + W64_ENTRY(3, 0) + mov 16(XP), T2 + mov 24(XP), T3 + mov 40(XP), H xor C1, C1 xor C2, C2 @@ -57,14 +59,14 @@ PROLOGUE(_nettle_ecc_secp192r1_modp) C Carry to be added in at T1 and T2 setc LREG(C2) - mov 8(RP), T1 - mov 32(RP), H + mov 8(XP), T1 + mov 32(XP), H adc H, T1 adc H, T2 C Carry to be added in at T0 and T1 setc LREG(C1) - mov (RP), T0 + mov (XP), T0 adc T3, T0 adc T3, T1 adc $0, C2 @@ -83,6 +85,6 @@ PROLOGUE(_nettle_ecc_secp192r1_modp) mov T1, 8(RP) mov T2, 16(RP) - W64_EXIT(2, 0) + W64_EXIT(3, 0) ret EPILOGUE(_nettle_ecc_secp192r1_modp) diff --git a/x86_64/ecc-secp224r1-modp.asm b/x86_64/ecc-secp224r1-modp.asm index 34ab52f4..4a667a87 100644 --- a/x86_64/ecc-secp224r1-modp.asm +++ b/x86_64/ecc-secp224r1-modp.asm @@ -35,20 +35,23 @@ ifelse(` GMP_NUMB_BITS(64) define(`RP', `%rsi') -define(`T0', `%rdi') C Overlaps unused modulo input +define(`XP', `%rdx') +define(`T0', `%rdi') C Overlaps unused modulo input define(`T1', `%rcx') define(`H0', `%rax') -define(`H1', `%rdx') -define(`H2', `%r8') -define(`F0', `%r9') +define(`H1', `%r8') +define(`H2', `%r9') +define(`F0', `%rsi') C Overlaps RP define(`F1', `%r10') define(`F2', `%r11') C ecc_secp224r1_modp (const struct ecc_modulo *m, mp_limb_t *rp) PROLOGUE(_nettle_ecc_secp224r1_modp) - W64_ENTRY(2, 0) - mov 48(RP), H0 - mov 56(RP), H1 + W64_ENTRY(3, 0) + push RP + + mov 48(XP), H0 + mov 56(XP), H1 C Set (F2,F1,F0) <-- (H1,H0) << 32 mov H0, F0 mov H0, F1 @@ -61,15 +64,15 @@ PROLOGUE(_nettle_ecc_secp224r1_modp) or T0, F1 xor H2, H2 - mov 16(RP), T0 - mov 24(RP), T1 + mov 16(XP), T0 + mov 24(XP), T1 sub F0, T0 sbb F1, T1 sbb F2, H0 sbb $0, H1 C No further borrow - adc 32(RP), H0 - adc 40(RP), H1 + adc 32(XP), H0 + adc 40(XP), H1 adc $0, H2 C Set (F2,F1,F0) <-- (H2,H1,H0) << 32 @@ -92,8 +95,8 @@ PROLOGUE(_nettle_ecc_secp224r1_modp) or T0, F1 or T1, F2 - mov (RP), T0 - mov 8(RP), T1 + mov (XP), T0 + mov 8(XP), T1 sub F0, T0 sbb F1, T1 sbb F2, H0 @@ -121,11 +124,12 @@ PROLOGUE(_nettle_ecc_secp224r1_modp) adc F2, H0 adc $0, H1 + pop RP mov T0, (RP) mov T1, 8(RP) mov H0, 16(RP) mov H1, 24(RP) - W64_EXIT(2, 0) + W64_EXIT(3, 0) ret EPILOGUE(_nettle_ecc_secp224r1_modp) diff --git a/x86_64/ecc-secp384r1-modp.asm b/x86_64/ecc-secp384r1-modp.asm index 24b3d92d..fbc3a2fd 100644 --- a/x86_64/ecc-secp384r1-modp.asm +++ b/x86_64/ecc-secp384r1-modp.asm @@ -32,13 +32,17 @@ ifelse(` .file "ecc-secp384r1-modp.asm" +C Input arguments: +C %rdi (unused) define(`RP', `%rsi') +define(`XP', `%rdx') + define(`D5', `%rax') define(`T0', `%rbx') define(`T1', `%rcx') -define(`T2', `%rdx') +define(`T2', `%rdi') define(`T3', `%rbp') -define(`T4', `%rdi') +define(`T4', `%rsi') define(`T5', `%r8') define(`H0', `%r9') define(`H1', `%r10') @@ -48,11 +52,12 @@ define(`H4', `%r13') define(`H5', `%r14') define(`C2', `%r15') define(`C0', H5) C Overlap -define(`TMP', RP) C Overlap +define(`TMP', XP) C Overlap + C void ecc_secp384r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp) PROLOGUE(_nettle_ecc_secp384r1_modp) - W64_ENTRY(2, 0) + W64_ENTRY(3, 0) push %rbx push %rbp @@ -61,6 +66,7 @@ PROLOGUE(_nettle_ecc_secp384r1_modp) push %r14 push %r15 + push RP C Output pointer C First get top 2 limbs, which need folding twice. C B^10 = B^6 + B^4 + 2^32 (B-1)B^4. C We handle the terms as follow: @@ -74,8 +80,8 @@ PROLOGUE(_nettle_ecc_secp384r1_modp) C in 2.5 limbs. The low limb saved in D5, high 1.5 limbs added C in. - mov 80(RP), H4 - mov 88(RP), H5 + mov 80(XP), H4 + mov 88(XP), H5 C Shift right 32 bits, into H1, H0 mov H4, H0 mov H5, H1 @@ -100,30 +106,28 @@ PROLOGUE(_nettle_ecc_secp384r1_modp) adc $0, C2 C Add in to high part - add 48(RP), H0 - adc 56(RP), H1 + add 48(XP), H0 + adc 56(XP), H1 adc $0, C2 C Do C2 later C +1 term - mov (RP), T0 + mov (XP), T0 add H0, T0 - mov 8(RP), T1 + mov 8(XP), T1 adc H1, T1 - mov 16(RP), T2 - mov 64(RP), H2 + mov 16(XP), T2 + mov 64(XP), H2 adc H2, T2 - mov 24(RP), T3 - mov 72(RP), H3 + mov 24(XP), T3 + mov 72(XP), H3 adc H3, T3 - mov 32(RP), T4 + mov 32(XP), T4 adc H4, T4 - mov 40(RP), T5 + mov 40(XP), T5 adc H5, T5 sbb C0, C0 neg C0 C FIXME: Switch sign of C0? - push RP - C +B^2 term add H0, T2 adc H1, T3 @@ -207,20 +211,20 @@ PROLOGUE(_nettle_ecc_secp384r1_modp) sub H1, H0 sbb $0, H1 - pop RP + pop XP C Original RP argument add H0, T0 - mov T0, (RP) + mov T0, (XP) adc H1, T1 - mov T1, 8(RP) + mov T1, 8(XP) adc C0, T2 - mov T2, 16(RP) + mov T2, 16(XP) adc $0, T3 - mov T3, 24(RP) + mov T3, 24(XP) adc $0, T4 - mov T4, 32(RP) + mov T4, 32(XP) adc $0, T5 - mov T5, 40(RP) + mov T5, 40(XP) pop %r15 pop %r14 @@ -229,6 +233,6 @@ PROLOGUE(_nettle_ecc_secp384r1_modp) pop %rbp pop %rbx - W64_EXIT(2, 0) + W64_EXIT(3, 0) ret EPILOGUE(_nettle_ecc_secp384r1_modp) diff --git a/x86_64/ecc-secp521r1-modp.asm b/x86_64/ecc-secp521r1-modp.asm index 16727893..00955fb5 100644 --- a/x86_64/ecc-secp521r1-modp.asm +++ b/x86_64/ecc-secp521r1-modp.asm @@ -35,89 +35,92 @@ ifelse(` GMP_NUMB_BITS(64) define(`RP', `%rsi') +define(`XP', `%rdx') + define(`U0', `%rax') define(`U1', `%rbx') define(`U2', `%rcx') -define(`U3', `%rdx') -define(`U4', `%rbp') -define(`U5', `%rdi') -define(`U6', `%r8') -define(`U7', `%r9') -define(`U8', `%r10') -define(`U9', `%r11') -define(`T0', `%r12') -define(`T1', `%r13') +define(`U3', `%rbp') +define(`U4', `%rdi') +define(`U5', `%r8') +define(`U6', `%r9') +define(`U7', `%r10') +define(`U8', `%r11') +define(`U9', `%r12') +define(`T0', `%r13') +define(`T1', `%r14') PROLOGUE(_nettle_ecc_secp521r1_modp) - W64_ENTRY(2, 0) + W64_ENTRY(3, 0) push %rbx push %rbp push %r12 push %r13 + push %r14 C Read top 17 limbs, shift left 55 bits - mov 72(RP), U1 + mov 72(XP), U1 mov U1, U0 shl $55, U0 shr $9, U1 - mov 80(RP), U2 + mov 80(XP), U2 mov U2, T0 shr $9, U2 shl $55, T0 or T0, U1 - mov 88(RP), U3 + mov 88(XP), U3 mov U3, T0 shr $9, U3 shl $55, T0 or T0, U2 - mov 96(RP), U4 + mov 96(XP), U4 mov U4, T0 shr $9, U4 shl $55, T0 or T0, U3 - mov 104(RP), U5 + mov 104(XP), U5 mov U5, T0 shr $9, U5 shl $55, T0 or T0, U4 - mov 112(RP), U6 + mov 112(XP), U6 mov U6, T0 shr $9, U6 shl $55, T0 or T0, U5 - mov 120(RP), U7 + mov 120(XP), U7 mov U7, T0 shr $9, U7 shl $55, T0 or T0, U6 - mov 128(RP), U8 + mov 128(XP), U8 mov U8, T0 shr $9, U8 shl $55, T0 or T0, U7 - mov 136(RP), U9 + mov 136(XP), U9 mov U9, T0 shr $9, U9 shl $55, T0 or T0, U8 - add (RP), U0 - adc 8(RP), U1 - adc 16(RP), U2 - adc 24(RP), U3 - adc 32(RP), U4 - adc 40(RP), U5 - adc 48(RP), U6 - adc 56(RP), U7 - adc 64(RP), U8 + add (XP), U0 + adc 8(XP), U1 + adc 16(XP), U2 + adc 24(XP), U3 + adc 32(XP), U4 + adc 40(XP), U5 + adc 48(XP), U6 + adc 56(XP), U7 + adc 64(XP), U8 adc $0, U9 C Top limbs are <U9, U8>. Keep low 9 bits of 8, and fold the @@ -149,10 +152,11 @@ PROLOGUE(_nettle_ecc_secp521r1_modp) adc $0, U8 mov U8, 64(RP) + pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx - W64_EXIT(2, 0) + W64_EXIT(3, 0) ret EPILOGUE(_nettle_ecc_secp521r1_modp) |