summary | refs | log | tree | commit | diff
path: root/x86_64
diff options
context:
space:
mode:
author Niels Möller <nisse@lysator.liu.se> 2022-01-27 22:10:50 +0100
committer Niels Möller <nisse@lysator.liu.se> 2022-01-27 22:10:50 +0100
commit f3656a4408aae5db4994674b85169fb7cbcd42b1 (patch)
tree 11daa8af4a3c875fe60618da293d64194ed80d99 /x86_64
parent b7268727a11bce0a350345c2671493d2ddd28b45 (diff)
download nettle-f3656a4408aae5db4994674b85169fb7cbcd42b1.tar.gz
x86_64: Rewrite of poly1305 assembly.
Diffstat (limited to 'x86_64')
-rw-r--r-- x86_64/poly1305-internal.asm 204
1 files changed, 113 insertions, 91 deletions
diff --git a/x86_64/poly1305-internal.asm b/x86_64/poly1305-internal.asm
index 1e2c60b6..ef2f38e4 100644
--- a/x86_64/poly1305-internal.asm
+++ b/x86_64/poly1305-internal.asm
@@ -33,153 +33,175 @@ ifelse(`
.file "poly1305-internal.asm"
C Registers mainly used by poly1305_block
-define(`CTX', `%rdi')
-define(`T0', `%rcx')
-define(`T1', `%rsi')
-define(`T2', `%r8')
-define(`H0', `%r9')
-define(`H1', `%r10')
-define(`H2', `%r11')
-
+define(`CTX', `%rdi') C First argument to all functions
+
+define(`KEY', `%rsi')
+define(`MASK',` %r8')
C _poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
.text
- C Registers:
- C %rdi: ctx
- C %rsi: key
- C %r8: mask
ALIGN(16)
PROLOGUE(_nettle_poly1305_set_key)
W64_ENTRY(2,0)
- mov $0x0ffffffc0fffffff, %r8
- mov (%rsi), %rax
- and %r8, %rax
- and $-4, %r8
- mov %rax, (CTX)
- mov 8(%rsi), %rax
- and %r8, %rax
+ mov $0x0ffffffc0fffffff, MASK
+ mov (KEY), %rax
+ and MASK, %rax
+ and $-4, MASK
+ mov %rax, P1305_R0 (CTX)
+ imul $5, %rax
+ mov %rax, P1305_S0 (CTX) C 5*R0
+ mov 8(KEY), %rax
+ and MASK, %rax
mov %rax, P1305_R1 (CTX)
shr $2, %rax
imul $5, %rax
- mov %rax, P1305_S1 (CTX)
+ mov %rax, P1305_S1 (CTX) C 5*(R1>>2)
xor XREG(%rax), XREG(%rax)
mov %rax, P1305_H0 (CTX)
mov %rax, P1305_H1 (CTX)
- mov XREG(%rax), P1305_H2 (CTX)
+ mov %rax, P1305_H2 (CTX)
W64_EXIT(2,0)
ret
+undefine(`KEY')
+undefine(`MASK')
+
EPILOGUE(_nettle_poly1305_set_key)
-C 64-bit multiplication mod 2^130 - 5
+define(`T0', `%rcx')
+define(`T1', `%rsi') C Overlaps message input pointer.
+define(`T2', `%r8')
+define(`H0', `%r9')
+define(`H1', `%r10')
+define(`F0', `%r11')
+define(`F1', `%r12')
+
+C First accumulate the independent products
+C
+C {H1,H0} = R0 T0 + S1 T1 + S0 (T2 >> 2)
+C {F1,F0} = R1 T0 + R0 T1 + S1 T2
+C T = R0 * (T2 & 3)
C
-C (x_0 + B x_1 + B^2 x_2) * (r_0 + B r_1) =
-C 1 B B^2 B^3
-C x_0 r_0
-C x_0 r_1
-C x_1 r_0
-C x_1 r_1
-C x_2 r_0
-C x_2 r_1
-C Then r_1 B^2 = r_1/4 (2^130) = 5/4 r_1.
-C and r_1 B^3 = 5/4 B r_1
-C So we get
+C Then add together as
C
-C x_0 r_0 + x_1 (5/4 r_1) + B (x_0 r_1 + x_1 r_0 + x_2 5/4 r_1 + B x_2 r_0)
-C 1 B B^2 B^3
-C x_0 r_0
-C x_1 r'_1
-C x_0 r_1
-C x_1 r_0
-C x_2 r'_1
-C x_2 r_0
+C        +--+--+--+
+C        |T |H1|H0|
+C        +--+--+--+
+C      + |F1|F0|
+C      --+--+--+--+
+C        |H2|H1|H0|
+C        +--+--+--+
C _poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16], unsigned hi)
PROLOGUE(_nettle_poly1305_block)
W64_ENTRY(3, 0)
+ push %r12
mov (%rsi), T0
mov 8(%rsi), T1
- mov XREG(%rdx), XREG(T2)
-
- C Registers:
- C Inputs: CTX, T0, T1, T2,
- C Outputs: H0, H1, H2, stored into the context.
+ mov XREG(%rdx), XREG(T2) C Also zero extends
add P1305_H0 (CTX), T0
adc P1305_H1 (CTX), T1
- adc P1305_H2 (CTX), XREG(T2)
- mov P1305_R0 (CTX), %rax
- mul T0 C x0*r0
+ adc P1305_H2 (CTX), T2
+
+ mov P1305_R1 (CTX), %rax
+ mul T0 C R1 T0
+ mov %rax, F0
+ mov %rdx, F1
+
+ mov T0, %rax C Last use of T0 input
+ mov P1305_R0 (CTX), T0
+ mul T0 C R0*T0
mov %rax, H0
mov %rdx, H1
- mov P1305_S1 (CTX), %rax C 5/4 r1
- mov %rax, H2
- mul T1 C x1*r1'
- imul T2, H2 C x2*r1'
- imul P1305_R0 (CTX), T2 C x2*r0
+
+ mov T1, %rax
+ mul T0 C R0*T1
+ add %rax, F0
+ adc %rdx, F1
+
+ mov P1305_S1 (CTX), T0
+ mov T1, %rax C Last use of T1 input
+ mul T0 C S1*T1
add %rax, H0
adc %rdx, H1
- mov P1305_R0 (CTX), %rax
- mul T1 C x1*r0
- add %rax, H2
- adc %rdx, T2
- mov P1305_R1 (CTX), %rax
- mul T0 C x0*r1
- add %rax, H2
- adc %rdx, T2
+
mov T2, %rax
- shr $2, %rax
- imul $5, %rax
- and $3, XREG(T2)
+ mul T0 C S1*T2
+ add %rax, F0
+ adc %rdx, F1
+
+ mov $3, XREG(T1)
+ and T2, T1
+
+ shr $2, T2
+ mov P1305_S0 (CTX), %rax
+ mul T2 C S0*(T2 >> 2)
add %rax, H0
- adc H2, H1
- adc $0, XREG(T2)
+ adc %rdx, H1
+
+ imul P1305_R0 (CTX), T1 C R0*(T2 & 3)
+ add F0, H1
+ adc T1, F1
+
mov H0, P1305_H0 (CTX)
mov H1, P1305_H1 (CTX)
- mov XREG(T2), P1305_H2 (CTX)
+ mov F1, P1305_H2 (CTX)
+ pop %r12
W64_EXIT(3, 0)
ret
EPILOGUE(_nettle_poly1305_block)
+undefine(`T0')
+undefine(`T1')
+undefine(`T2')
+undefine(`H0')
+undefine(`H1')
+undefine(`F0')
+undefine(`F1')
C _poly1305_digest (struct poly1305_ctx *ctx, uint8_t *s)
- C Registers:
- C %rdi: ctx
- C %rsi: s
-
+define(`S', `%rsi')
+
+define(`T0', `%rcx')
+define(`T1', `%r8')
+define(`H0', `%r9')
+define(`H1', `%r10')
+define(`F0', `%r11')
+define(`F1', `%rdi') C Overlaps CTX
+
PROLOGUE(_nettle_poly1305_digest)
W64_ENTRY(2, 0)
mov P1305_H0 (CTX), H0
mov P1305_H1 (CTX), H1
- mov P1305_H2 (CTX), XREG(H2)
- mov XREG(H2), XREG(%rax)
- shr $2, XREG(%rax)
- and $3, H2
- imul $5, XREG(%rax)
- add %rax, H0
+ mov P1305_H2 (CTX), F0
+
+ xor XREG(%rax), XREG(%rax)
+ mov %rax, P1305_H0 (CTX)
+ mov %rax, P1305_H1 (CTX)
+ mov %rax, P1305_H2 (CTX)
+
+ mov $3, XREG(%rax)
+ and XREG(F0), XREG(%rax)
+ shr $2, F0
+ imul $5, F0
+ add F0, H0
adc $0, H1
- adc $0, XREG(H2)
+ adc $0, XREG(%rax)
-C Use %rax instead of %rsi
-define(`T1', `%rax')
C Add 5, use result if >= 2^130
mov $5, T0
xor T1, T1
add H0, T0
adc H1, T1
- adc $0, XREG(H2)
- cmp $4, XREG(H2)
- cmovnc T0, H0
- cmovnc T1, H1
+ adc $-4, XREG(%rax) C Carry if %rax + c >= 4
+ cmovc T0, H0
+ cmovc T1, H1
- add H0, (%rsi)
- adc H1, 8(%rsi)
+ add H0, (S)
+ adc H1, 8(S)
- xor XREG(%rax), XREG(%rax)
- mov %rax, P1305_H0 (CTX)
- mov %rax, P1305_H1 (CTX)
- mov XREG(%rax), P1305_H2 (CTX)
W64_EXIT(2, 0)
ret
EPILOGUE(_nettle_poly1305_digest)