author     Jussi Kivilinna <jussi.kivilinna@iki.fi>  2023-04-16 21:45:13 +0300
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>  2023-04-23 21:19:09 +0300
commit     ad4ee8d52f7199ba8bdee767044337060529069f (patch)
tree       80321ad5be8f2fb47452c4e748f02829235ec6cd /mpi/amd64
parent     3e17e819a6a4d505828cf93fc2c258a753f1d38c (diff)
mpi/amd64: optimize add_n and sub_n
* mpi/amd64/mpih-add1.S (_gcry_mpih_add_n): New implementation
with 4x unrolled fast-path loop.
* mpi/amd64/mpih-sub1.S (_gcry_mpih_sub_n): Likewise.
--
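For orientation, the new fast path can be sketched in C. This is
illustrative only: the names below are invented, and the real assembly
keeps the running carry in the CPU carry flag instead of a variable.
The idea is to peel off size % 4 limbs first (the .Lprehandle1/2/3
paths) and then retire four limbs per iteration of the unrolled .Loop.

    #include <stddef.h>

    typedef unsigned long long limb_t;   /* assumed 64-bit limb */

    /* One full-adder step: *rp = a + b + cy, returns the carry-out
     * (0 or 1).  In the assembly this is a single adcq. */
    static limb_t add_step(limb_t *rp, limb_t a, limb_t b, limb_t cy)
    {
        limb_t s = a + b;                /* may wrap */
        limb_t c = s < a;                /* carry out of a + b */
        *rp = s + cy;
        return c | (*rp < s);            /* carry out of adding cy */
    }

    /* Sketch of the new control flow (hypothetical function name). */
    limb_t add_n_sketch(limb_t *rp, const limb_t *s1, const limb_t *s2,
                        size_t size)
    {
        limb_t cy = 0;
        size_t head = size % 4;          /* .Lprehandle0..3 dispatch */

        for (size_t i = 0; i < head; i++)          /* peel 1-3 limbs */
            cy = add_step(&rp[i], s1[i], s2[i], cy);

        for (size_t i = head; i < size; i += 4) {  /* 4x unrolled .Loop */
            cy = add_step(&rp[i + 0], s1[i + 0], s2[i + 0], cy);
            cy = add_step(&rp[i + 1], s1[i + 1], s2[i + 1], cy);
            cy = add_step(&rp[i + 2], s1[i + 2], s2[i + 2], cy);
            cy = add_step(&rp[i + 3], s1[i + 3], s2[i + 3], cy);
        }
        return cy;                       /* final carry, %rax in the asm */
    }

The assembly can keep a single carry chain across all of this because
its bookkeeping instructions preserve the carry flag: leaq updates the
pointers without touching flags, and decq writes every arithmetic flag
except CF, so the adcq chain survives from one loop iteration to the
next.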
Benchmark on AMD Ryzen 9 7900X:

Before:
      |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 add  |     0.035 ns/B     27559 MiB/s     0.163 c/B      4700
 sub  |     0.034 ns/B     28332 MiB/s     0.158 c/B      4700

After (~26% faster):
      |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 add  |     0.027 ns/B     35271 MiB/s     0.127 c/B      4700
 sub  |     0.027 ns/B     35206 MiB/s     0.127 c/B      4700
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'mpi/amd64')
 mpi/amd64/mpih-add1.S | 81 +++++++++++++++++++++++++++++++++++++++------
 mpi/amd64/mpih-sub1.S | 80 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 136 insertions(+), 25 deletions(-)
diff --git a/mpi/amd64/mpih-add1.S b/mpi/amd64/mpih-add1.S
index 833a43cb..f2e86237 100644
--- a/mpi/amd64/mpih-add1.S
+++ b/mpi/amd64/mpih-add1.S
@@ -3,6 +3,7 @@
  *
  *      Copyright (C) 1992, 1994, 1995, 1998,
  *                    2001, 2002, 2006 Free Software Foundation, Inc.
+ *      Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -39,26 +40,80 @@
  *		     mpi_ptr_t s2_ptr,		rdx
  *		     mpi_size_t size)		rcx
  */
-	TEXT
 	ALIGN(4)
 	.globl C_SYMBOL_NAME(_gcry_mpih_add_n)
 C_SYMBOL_NAME(_gcry_mpih_add_n:)
 	FUNC_ENTRY()
-	leaq	(%rsi,%rcx,8), %rsi
-	leaq	(%rdi,%rcx,8), %rdi
-	leaq	(%rdx,%rcx,8), %rdx
-	negq	%rcx
-	xorl	%eax, %eax		/* clear cy */
+	movl	%ecx, %r9d
+	andl	$3, %r9d
+	je	.Lprehandle0
+	cmpl	$2, %r9d
+	jb	.Lprehandle1
+	je	.Lprehandle2
+
+#define FIRST_ADD() \
+	movq	(%rsi), %rax; \
+	addq	(%rdx), %rax; \
+	movq	%rax, (%rdi)
+
+#define NEXT_ADD(offset) \
+	movq	offset(%rsi), %rax; \
+	adcq	offset(%rdx), %rax; \
+	movq	%rax, offset(%rdi)
+
+.Lprehandle3:
+	leaq	-2(%rcx), %rcx
+	FIRST_ADD();
+	NEXT_ADD(8);
+	NEXT_ADD(16);
+	decq	%rcx
+	je	.Lend
+	leaq	24(%rsi), %rsi
+	leaq	24(%rdx), %rdx
+	leaq	24(%rdi), %rdi
+	jmp	.Loop
+
+	ALIGN(3)
+.Lprehandle2:
+	leaq	-1(%rcx), %rcx
+	FIRST_ADD();
+	NEXT_ADD(8);
+	decq	%rcx
+	je	.Lend
+	leaq	16(%rsi), %rsi
+	leaq	16(%rdx), %rdx
+	leaq	16(%rdi), %rdi
+	jmp	.Loop
+
+	ALIGN(3)
+.Lprehandle1:
+	FIRST_ADD();
+	decq	%rcx
+	je	.Lend
+	leaq	8(%rsi), %rsi
+	leaq	8(%rdx), %rdx
+	leaq	8(%rdi), %rdi
+	jmp	.Loop
+
+	ALIGN(3)
+.Lprehandle0:
+	clc				/* clear cy */
 
 	ALIGN(4)			/* minimal alignment for claimed speed */
-.Loop:	movq	(%rsi,%rcx,8), %rax
-	movq	(%rdx,%rcx,8), %r10
-	adcq	%r10, %rax
-	movq	%rax, (%rdi,%rcx,8)
-	incq	%rcx
+.Loop:	leaq	-3(%rcx), %rcx
+	NEXT_ADD(0);
+	NEXT_ADD(8);
+	NEXT_ADD(16);
+	NEXT_ADD(24);
+	leaq	32(%rsi), %rsi
+	leaq	32(%rdx), %rdx
+	leaq	32(%rdi), %rdi
+	decq	%rcx
 	jne	.Loop
 
-	movq	%rcx, %rax		/* zero %rax */
-	adcq	%rax, %rax
+	ALIGN(2)
+.Lend:
+	movl	$0, %eax		/* zero %rax */
+	adcl	%eax, %eax
 	FUNC_EXIT()
diff --git a/mpi/amd64/mpih-sub1.S b/mpi/amd64/mpih-sub1.S
index 8c61cb20..32799c86 100644
--- a/mpi/amd64/mpih-sub1.S
+++ b/mpi/amd64/mpih-sub1.S
@@ -3,6 +3,7 @@
  *
  *      Copyright (C) 1992, 1994, 1995, 1998,
  *                    2001, 2002, 2006 Free Software Foundation, Inc.
+ *      Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -44,20 +45,75 @@
 	.globl C_SYMBOL_NAME(_gcry_mpih_sub_n)
 C_SYMBOL_NAME(_gcry_mpih_sub_n:)
 	FUNC_ENTRY()
-	leaq	(%rsi,%rcx,8), %rsi
-	leaq	(%rdi,%rcx,8), %rdi
-	leaq	(%rdx,%rcx,8), %rdx
-	negq	%rcx
-	xorl	%eax, %eax		/* clear cy */
+	movl	%ecx, %r9d
+	andl	$3, %r9d
+	je	.Lprehandle0
+	cmpl	$2, %r9d
+	jb	.Lprehandle1
+	je	.Lprehandle2
+
+#define FIRST_SUB() \
+	movq	(%rsi), %rax; \
+	subq	(%rdx), %rax; \
+	movq	%rax, (%rdi)
+
+#define NEXT_SUB(offset) \
+	movq	offset(%rsi), %rax; \
+	sbbq	offset(%rdx), %rax; \
+	movq	%rax, offset(%rdi)
+
+.Lprehandle3:
+	leaq	-2(%rcx), %rcx
+	FIRST_SUB();
+	NEXT_SUB(8);
+	NEXT_SUB(16);
+	decq	%rcx
+	je	.Lend
+	leaq	24(%rsi), %rsi
+	leaq	24(%rdx), %rdx
+	leaq	24(%rdi), %rdi
+	jmp	.Loop
+
+	ALIGN(3)
+.Lprehandle2:
+	leaq	-1(%rcx), %rcx
+	FIRST_SUB();
+	NEXT_SUB(8);
+	decq	%rcx
+	je	.Lend
+	leaq	16(%rsi), %rsi
+	leaq	16(%rdx), %rdx
+	leaq	16(%rdi), %rdi
+	jmp	.Loop
+
+	ALIGN(3)
+.Lprehandle1:
+	FIRST_SUB();
+	decq	%rcx
+	je	.Lend
+	leaq	8(%rsi), %rsi
+	leaq	8(%rdx), %rdx
+	leaq	8(%rdi), %rdi
+	jmp	.Loop
+
+	ALIGN(3)
+.Lprehandle0:
+	clc				/* clear cy */
 
 	ALIGN(4)			/* minimal alignment for claimed speed */
-.Loop:	movq	(%rsi,%rcx,8), %rax
-	movq	(%rdx,%rcx,8), %r10
-	sbbq	%r10, %rax
-	movq	%rax, (%rdi,%rcx,8)
-	incq	%rcx
+.Loop:	leaq	-3(%rcx), %rcx
+	NEXT_SUB(0);
+	NEXT_SUB(8);
+	NEXT_SUB(16);
+	NEXT_SUB(24);
+	leaq	32(%rsi), %rsi
+	leaq	32(%rdx), %rdx
+	leaq	32(%rdi), %rdi
+	decq	%rcx
 	jne	.Loop
 
-	movq	%rcx, %rax		/* zero %rax */
-	adcq	%rax, %rax
+	ALIGN(2)
+.Lend:
+	movl	$0, %eax		/* zero %rax */
+	adcl	%eax, %eax
 	FUNC_EXIT()
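As a sanity check, the new code can be compared against a portable
reference implementation. The harness below is an assumption, not part
of this commit or of libgcrypt's test suite: it hand-declares the
internal _gcry_mpih_add_n symbol with a plausible LP64 signature (per
the rdi/rsi/rdx/rcx register comment in the diff) and expects to be
linked against the assembled mpih-add1.S.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef unsigned long long limb_t;       /* assumed 64-bit limb */

    /* Assumed signature; the real libgcrypt typedefs may differ. */
    extern limb_t _gcry_mpih_add_n(limb_t *rp, limb_t *s1, limb_t *s2,
                                   long size);

    /* Portable reference: returns the final carry-out, like the asm. */
    static limb_t ref_add_n(limb_t *rp, const limb_t *s1,
                            const limb_t *s2, long n)
    {
        limb_t cy = 0;
        for (long i = 0; i < n; i++) {
            limb_t x = s1[i], y = x + s2[i];
            limb_t c = y < x;                /* carry from x + s2[i] */
            rp[i] = y + cy;
            cy = c | (rp[i] < y);            /* carry from adding cy */
        }
        return cy;
    }

    int main(void)
    {
        limb_t a[16], b[16], r1[16], r2[16];
        /* Sizes 1..16 exercise every size % 4 prehandle path. */
        for (long n = 1; n <= 16; n++) {
            for (long i = 0; i < n; i++) {
                a[i] = ((limb_t)rand() << 32) | (unsigned)rand();
                b[i] = ((limb_t)rand() << 32) | (unsigned)rand();
            }
            limb_t c1 = _gcry_mpih_add_n(r1, a, b, n);
            limb_t c2 = ref_add_n(r2, a, b, n);
            if (c1 != c2 || memcmp(r1, r2, n * sizeof *r1)) {
                printf("mismatch at size %ld\n", n);
                return 1;
            }
        }
        puts("ok");
        return 0;
    }

The same idea applies to _gcry_mpih_sub_n, with the reference using
borrow propagation (subtraction) instead of carry.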