diff options
author | Niels Möller <nisse@lysator.liu.se> | 2022-01-27 22:10:50 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2022-01-27 22:10:50 +0100 |
commit | f3656a4408aae5db4994674b85169fb7cbcd42b1 (patch) | |
tree | 11daa8af4a3c875fe60618da293d64194ed80d99 /x86_64 | |
parent | b7268727a11bce0a350345c2671493d2ddd28b45 (diff) | |
download | nettle-f3656a4408aae5db4994674b85169fb7cbcd42b1.tar.gz |
x86_64: Rewrite of poly1305 assembly.
Diffstat (limited to 'x86_64')
-rw-r--r-- | x86_64/poly1305-internal.asm | 204 |
1 files changed, 113 insertions, 91 deletions
diff --git a/x86_64/poly1305-internal.asm b/x86_64/poly1305-internal.asm index 1e2c60b6..ef2f38e4 100644 --- a/x86_64/poly1305-internal.asm +++ b/x86_64/poly1305-internal.asm @@ -33,153 +33,175 @@ ifelse(` .file "poly1305-internal.asm" C Registers mainly used by poly1305_block -define(`CTX', `%rdi') -define(`T0', `%rcx') -define(`T1', `%rsi') -define(`T2', `%r8') -define(`H0', `%r9') -define(`H1', `%r10') -define(`H2', `%r11') - +define(`CTX', `%rdi') C First argument to all functions + +define(`KEY', `%rsi') +define(`MASK',` %r8') C _poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16]) .text - C Registers: - C %rdi: ctx - C %rsi: key - C %r8: mask ALIGN(16) PROLOGUE(_nettle_poly1305_set_key) W64_ENTRY(2,0) - mov $0x0ffffffc0fffffff, %r8 - mov (%rsi), %rax - and %r8, %rax - and $-4, %r8 - mov %rax, (CTX) - mov 8(%rsi), %rax - and %r8, %rax + mov $0x0ffffffc0fffffff, MASK + mov (KEY), %rax + and MASK, %rax + and $-4, MASK + mov %rax, P1305_R0 (CTX) + imul $5, %rax + mov %rax, P1305_S0 (CTX) C 5*R0 + mov 8(KEY), %rax + and MASK, %rax mov %rax, P1305_R1 (CTX) shr $2, %rax imul $5, %rax - mov %rax, P1305_S1 (CTX) + mov %rax, P1305_S1 (CTX) C 5*(R1>>2) xor XREG(%rax), XREG(%rax) mov %rax, P1305_H0 (CTX) mov %rax, P1305_H1 (CTX) - mov XREG(%rax), P1305_H2 (CTX) + mov %rax, P1305_H2 (CTX) W64_EXIT(2,0) ret +undefine(`KEY') +undefine(`MASK') + EPILOGUE(_nettle_poly1305_set_key) -C 64-bit multiplication mod 2^130 - 5 +define(`T0', `%rcx') +define(`T1', `%rsi') C Overlaps message input pointer. +define(`T2', `%r8') +define(`H0', `%r9') +define(`H1', `%r10') +define(`F0', `%r11') +define(`F1', `%r12') + +C First accumulate the independent products +C +C {H1,H0} = R0 T0 + S1 T1 + S0 (T2 >> 2) +C {F1,F0} = R1 T0 + R0 T1 + S1 T2 +C T = R0 * (T2 & 3) C -C (x_0 + B x_1 + B^2 x_2) * (r_0 + B r_1) = -C 1 B B^2 B^3 -C x_0 r_0 -C x_0 r_1 -C x_1 r_0 -C x_1 r_1 -C x_2 r_0 -C x_2 r_1 -C Then r_1 B^2 = r_1/4 (2^130) = 5/4 r_1. 
-C and r_1 B^3 = 5/4 B r_1 -C So we get +C Then add together as C -C x_0 r_0 + x_1 (5/4 r_1) + B (x_0 r_1 + x_1 r_0 + x_2 5/4 r_1 + B x_2 r_0) -C 1 B B^2 B^3 -C x_0 r_0 -C x_1 r'_1 -C x_0 r_1 -C x_1 r_0 -C x_2 r'_1 -C x_2 r_0 +C +--+--+--+ +C |T |H1|H0| +C +--+--+--+ +C + |F1|F0| +C --+--+--+--+ +C |H2|H1|H0| +C +--+--+--+ C _poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16], unsigned hi) PROLOGUE(_nettle_poly1305_block) W64_ENTRY(3, 0) + push %r12 mov (%rsi), T0 mov 8(%rsi), T1 - mov XREG(%rdx), XREG(T2) - - C Registers: - C Inputs: CTX, T0, T1, T2, - C Outputs: H0, H1, H2, stored into the context. + mov XREG(%rdx), XREG(T2) C Also zero extends add P1305_H0 (CTX), T0 adc P1305_H1 (CTX), T1 - adc P1305_H2 (CTX), XREG(T2) - mov P1305_R0 (CTX), %rax - mul T0 C x0*r0 + adc P1305_H2 (CTX), T2 + + mov P1305_R1 (CTX), %rax + mul T0 C R1 T0 + mov %rax, F0 + mov %rdx, F1 + + mov T0, %rax C Last use of T0 input + mov P1305_R0 (CTX), T0 + mul T0 C R0*T0 mov %rax, H0 mov %rdx, H1 - mov P1305_S1 (CTX), %rax C 5/4 r1 - mov %rax, H2 - mul T1 C x1*r1' - imul T2, H2 C x2*r1' - imul P1305_R0 (CTX), T2 C x2*r0 + + mov T1, %rax + mul T0 C R0*T1 + add %rax, F0 + adc %rdx, F1 + + mov P1305_S1 (CTX), T0 + mov T1, %rax C Last use of T1 input + mul T0 C S1*T1 add %rax, H0 adc %rdx, H1 - mov P1305_R0 (CTX), %rax - mul T1 C x1*r0 - add %rax, H2 - adc %rdx, T2 - mov P1305_R1 (CTX), %rax - mul T0 C x0*r1 - add %rax, H2 - adc %rdx, T2 + mov T2, %rax - shr $2, %rax - imul $5, %rax - and $3, XREG(T2) + mul T0 C S1*T2 + add %rax, F0 + adc %rdx, F1 + + mov $3, XREG(T1) + and T2, T1 + + shr $2, T2 + mov P1305_S0 (CTX), %rax + mul T2 C S0*(T2 >> 2) add %rax, H0 - adc H2, H1 - adc $0, XREG(T2) + adc %rdx, H1 + + imul P1305_R0 (CTX), T1 C R0*(T2 & 3) + add F0, H1 + adc T1, F1 + mov H0, P1305_H0 (CTX) mov H1, P1305_H1 (CTX) - mov XREG(T2), P1305_H2 (CTX) + mov F1, P1305_H2 (CTX) + pop %r12 W64_EXIT(3, 0) ret EPILOGUE(_nettle_poly1305_block) +undefine(`T0') +undefine(`T1') +undefine(`T2') 
+undefine(`H0') +undefine(`H1') +undefine(`F0') +undefine(`F1') C _poly1305_digest (struct poly1305_ctx *ctx, uint8_t *s) - C Registers: - C %rdi: ctx - C %rsi: s - +define(`S', `%rsi') + +define(`T0', `%rcx') +define(`T1', `%r8') +define(`H0', `%r9') +define(`H1', `%r10') +define(`F0', `%r11') +define(`F1', `%rdi') C Overlaps CTX + PROLOGUE(_nettle_poly1305_digest) W64_ENTRY(2, 0) mov P1305_H0 (CTX), H0 mov P1305_H1 (CTX), H1 - mov P1305_H2 (CTX), XREG(H2) - mov XREG(H2), XREG(%rax) - shr $2, XREG(%rax) - and $3, H2 - imul $5, XREG(%rax) - add %rax, H0 + mov P1305_H2 (CTX), F0 + + xor XREG(%rax), XREG(%rax) + mov %rax, P1305_H0 (CTX) + mov %rax, P1305_H1 (CTX) + mov %rax, P1305_H2 (CTX) + + mov $3, XREG(%rax) + and XREG(F0), XREG(%rax) + shr $2, F0 + imul $5, F0 + add F0, H0 adc $0, H1 - adc $0, XREG(H2) + adc $0, XREG(%rax) -C Use %rax instead of %rsi -define(`T1', `%rax') C Add 5, use result if >= 2^130 mov $5, T0 xor T1, T1 add H0, T0 adc H1, T1 - adc $0, XREG(H2) - cmp $4, XREG(H2) - cmovnc T0, H0 - cmovnc T1, H1 + adc $-4, XREG(%rax) C Carry if %rax + c >= 4 + cmovc T0, H0 + cmovc T1, H1 - add H0, (%rsi) - adc H1, 8(%rsi) + add H0, (S) + adc H1, 8(S) - xor XREG(%rax), XREG(%rax) - mov %rax, P1305_H0 (CTX) - mov %rax, P1305_H1 (CTX) - mov XREG(%rax), P1305_H2 (CTX) W64_EXIT(2, 0) ret EPILOGUE(_nettle_poly1305_digest) |