summaryrefslogtreecommitdiff
path: root/mpn
diff options
context:
space:
mode:
authorTorbjorn Granlund <tg@gmplib.org>2021-01-17 22:19:23 +0100
committerTorbjorn Granlund <tg@gmplib.org>2021-01-17 22:19:23 +0100
commit1a8fa518934840fce207cc6a74256a28477beccd (patch)
treeea3003d62ef2343491422c996fb150bcacb76590 /mpn
parent9cb1d42bdcd595d5831848e29b2707f60f706973 (diff)
downloadgmp-1a8fa518934840fce207cc6a74256a28477beccd.tar.gz
Provide mpn_sbpi1_bdiv_r for bwl/skl/zen.
Diffstat (limited to 'mpn')
-rw-r--r--mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm710
1 files changed, 710 insertions, 0 deletions
diff --git a/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm b/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
new file mode 100644
index 000000000..ff3512422
--- /dev/null
+++ b/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
@@ -0,0 +1,710 @@
+dnl AMD64 mpn_sbpi1_bdiv_r optimised for Intel Broadwell.
+
+dnl Copyright 2015, 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 addmul_1
+C AMD K8,K9 n/a n/a
+C AMD K10 n/a n/a
+C AMD bd1 n/a n/a
+C AMD bd2 n/a n/a
+C AMD bd3 n/a n/a
+C AMD bd4 ? ?
+C AMD zn1 ? ?
+C AMD zn2 ? ?
+C AMD zn3 ? ?
+C AMD bt1 n/a n/a
+C AMD bt2 n/a n/a
+C Intel P4 n/a n/a
+C Intel PNR n/a n/a
+C Intel NHM n/a n/a
+C Intel SBR n/a n/a
+C Intel IBR n/a n/a
+C Intel HWL 1.68 n/a
+C Intel BWL 1.51 1.67-1.74
+C Intel SKL 1.52 1.63-1.71
+C Intel atom n/a n/a
+C Intel SLM n/a n/a
+C VIA nano n/a n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Do overlapped software pipelining.
+C * Reduce register use, i.e., by combining n_neg and n_save.
+C * Supporess initial store through up, it's always a zero.
+C * Streamline up and dp setup.
+C * When changing this, make sure the code which falls into the inner loops
+C does not execute too many no-ops (for both PIC and non-PIC).
+
+dnl mp_limb_t
+dnl mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un,
+dnl mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+
+define(`up', `%rdi')
+define(`un', `%rsi')
+define(`dp_param',`%rdx')
+define(`dn_param',`%rcx')
+define(`dinv', `%r8')
+
+define(`n', `%rcx')
+define(`n_save', `%rbp')
+define(`dp', `%r14')
+define(`n_neg', `%rbx')
+define(`q', `%rdx')
+define(`jaddr', `%rax')
+
+define(`w0', `%r12')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ifdef(`MAX_SPECIAL',,`
+define(`MAX_SPECIAL', 8)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+
+ lea L(atab)(%rip), %r10
+
+ cmp $MAX_SPECIAL, dn_param
+ jbe L(sma)
+
+ifelse(MAX_SPECIAL,8,,`
+forloop(i,eval(MAX_SPECIAL+1),9,`L(i):
+')')
+
+L(gen): push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+
+ sub dn_param, un C outer loop count
+
+ lea -8(,dn_param,8), n_neg
+ neg n_neg
+ mov dn_param, n_save
+ mov R32(dn_param), R32(%rax)
+ shr $3, n_save C loop count
+ and $7, R32(%rax) C clear CF and OF as side-effect
+
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax,%r10), jaddr
+',`
+ mov (%r10,%rax,8), jaddr
+')
+ mov (up), q
+ imul dinv, q
+ jmp L(outer)
+
+L(f0): mulx( (dp), w2, w3)
+ lea -1(n), n
+ mulx( 8,(dp), w0, w1)
+ lea -8(dp), dp
+ adcx( w3, w0)
+ adox( (up), w2)
+ lea -8(up), up
+ jmp L(b0x)
+
+L(f3): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -48(up), up
+ lea 16(dp), dp
+ jmp L(b3x)
+
+L(f4): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 24(dp), dp
+ adox( (up), w2)
+ lea -40(up), up
+ adcx( w3, w0)
+ jmp L(b4x)
+
+L(f5): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ lea 32(dp), dp
+ adcx( w1, w2)
+ adox( (up), w0)
+ lea -32(up), up
+ jmp L(b5x)
+
+L(f6): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 40(dp), dp
+ adox( (up), w2)
+ lea -24(up), up
+ adcx( w3, w0)
+ jmp L(b6x)
+
+L(f7): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ lea 48(dp), dp
+ adcx( w1, w2)
+ adox( (up), w0)
+ lea -16(up), up
+ jmp L(b7x)
+
+L(f1): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -1(n), n
+ jmp L(b1x)
+
+L(f2): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 8(dp), dp
+ adox( (up), w2)
+ lea 8(up), up
+ adcx( w3, w0)
+ jmp L(b2x)
+
+L(end): adox( (up), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+ mov w0, (up)
+ adc %rcx, w1 C relies on rcx = 0
+ mov 8(up,n_neg), q C Compute next quotient early...
+ mulx( dinv, q, %r12) C ...(unused in last iteration)
+ bt $0, R32(%r13)
+ adc w1, 8(up)
+ setc R8(%r13)
+ dec un C clear OF as side-effect
+ jz L(done)
+
+ lea (dp,n_neg), dp C reset dp to D[]'s beginning
+ lea 8(up,n_neg), up C point up to U[]'s current beginning
+L(outer):
+ mov n_save, n
+ test %eax, %eax C clear CF and OF
+ jmp *jaddr
+
+ ALIGN(16)
+L(top): adox( -8,(up), w2)
+ adcx( w3, w0)
+ mov w2, -8(up)
+ jrcxz L(end)
+L(b2x): mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -1(n), n
+ mov w0, (up)
+L(b1x): adcx( w1, w2)
+ mulx( 16,(dp), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(up), w2)
+ mov w2, 8(up)
+L(b0x): mulx( 24,(dp), w2, w3)
+ lea 64(dp), dp
+ adcx( w1, w2)
+ adox( 16,(up), w0)
+ mov w0, 16(up)
+L(b7x): mulx( -32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov w2, 24(up)
+L(b6x): mulx( -24,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(up), w0)
+ mov w0, 32(up)
+L(b5x): mulx( -16,(dp), w0, w1)
+ adox( 40,(up), w2)
+ adcx( w3, w0)
+ mov w2, 40(up)
+L(b4x): adox( 48,(up), w0)
+ mulx( -8,(dp), w2, w3)
+ mov w0, 48(up)
+L(b3x): lea 64(up), up
+ adcx( w1, w2)
+ mulx( (dp), w0, w1)
+ jmp L(top)
+
+L(done):mov %r13, %rax
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(sma):
+ifdef(`PIC',
+` movslq 28(%r10,dn_param,4), %rax
+ lea (%rax,%r10), jaddr
+',`
+ mov 56(%r10,dn_param,8), jaddr
+')
+ jmp *jaddr
+
+L(1): mov (dp_param), %r10
+ xor R32(%rax), R32(%rax)
+ mov (up), %rdx
+ dec un
+ mov %rdx, %r9
+L(o1): mulx( dinv, %rdx, %r11) C next quotient
+ lea 8(up), up
+ mulx( %r10, %rcx, %rdx) C 0 1
+ add %r9, %rcx C 0
+ adc %rax, %rdx C 1
+ add (up), %rdx C 1
+ setc R8(%rax) C 2
+ mov %rdx, %r9 C 1
+ dec un
+ jnz L(o1)
+ mov %r9, (up)
+
+ FUNC_EXIT()
+ ret
+
+ifdef(`VER',,`define(`VER',1)')
+L(2): push %r12
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ sub dn_param, un C loop count
+ mov (up), q
+ imul dinv, q
+
+ifelse(VER,0,`
+ xor R32(%rax), R32(%rax)
+L(o2): test %eax, %eax C clear CF and OF
+ mulx( (dp), w2, w3) C 0 1
+ mulx( 8,(dp), %rdx, w1) C 1 2
+ add (up), w2 C 0
+ adc 8(up), %rdx C 1
+ adc $0, w1 C 2 cannot carry further
+ add w3, %rdx C 1
+ mov %rdx, 8(up) C 1
+ adc $0, w1 C 2
+ imul dinv, q C
+ bt $0, R32(%rax)
+ adc 16(up), w1 C 2
+ mov w1, 16(up)
+ setc R8(%rax)
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+')
+ifelse(VER,1,`
+ push %rbx
+ push %r13
+ xor R32(%r13), R32(%r13)
+ mov (up), %rax
+ mov 8(up), %rbx
+L(o2): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3) C 0 1
+ mulx( 8,(dp), %rdx, w1) C 1 2
+ adox( %rax, w2) C 0
+ adcx( w3, %rdx) C 1
+ adox( %rbx, %rdx) C 1
+ adox( %rcx, w1) C 2 cannot carry further
+ mov %rdx, %rax C 1
+ adc %rcx, w1 C 2
+ imul dinv, q C
+ bt $0, R32(%r13)
+ adc 16(up), w1 C 2
+ mov w1, %rbx
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+
+ mov %rax, (up)
+ mov %rbx, 8(up)
+ mov %r13, %rax
+ pop %r13
+ pop %rbx
+')
+ifelse(VER,2,`
+ xor R32(%rax), R32(%rax)
+ mov (up), %r10
+ mov 8(up), %r9
+L(o2): mulx( (dp), %r12, %r11)
+ mulx( 8,(dp), %rdx, %rcx)
+ add %r11, %rdx C 1
+ adc $0, %rcx C 2
+ add %r10, %r12 C 0 add just to produce carry
+ adc %r9, %rdx C 1
+ mov %rdx, %r10 C 1
+ mulx( dinv, %rdx, %r12) C next quotient
+ adc %rax, %rcx C 2
+ setc R8(%rax) C 3
+ mov 16(up), %r9 C 2
+ add %rcx, %r9 C 2
+ adc $0, R32(%rax) C 3
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+
+ mov %r10, (up)
+ mov %r9, 8(up)
+')
+ifelse(VER,3,`
+ xor R32(%rax), R32(%rax)
+ mov (up), %r10
+ mov 8(up), %r9
+L(o2): mulx( (dp), %r12, %r11)
+ add %r10, %r12 C 0 add just to produce carry
+ mulx( 8,(dp), %rdx, %rcx)
+ adc %r11, %rdx C 1
+ adc $0, %rcx C 2
+ add %r9, %rdx C 1
+ mov %rdx, %r10 C 1
+ mulx( dinv, %rdx, %r12) C next quotient
+ adc %rax, %rcx C 2
+ setc R8(%rax) C 3
+ mov 16(up), %r9 C 2
+ add %rcx, %r9 C 2
+ adc $0, R32(%rax) C 3
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+
+ mov %r10, (up)
+ mov %r9, 8(up)
+')
+ pop %r14
+ pop %r12
+ FUNC_EXIT()
+ ret
+
+ifelse(eval(MAX_SPECIAL>=3),1,`
+L(3): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o3): xor R32(%rcx), R32(%rcx) C clear rcx, CF, and OF
+ mulx( (dp), w0, w1) C 0 1
+ adox( %rax, w0) C 0
+ mulx( 8,(dp), %rax, w3) C 1 2
+ adcx( w1, %rax) C 1
+ adox( %rbx, %rax) C 1
+ mulx( 16,(dp), %rbx, w1) C 2 3
+ mov dinv, q C 1
+ mulx( %rax, q, w0)
+ adcx( w3, %rbx) C 2
+ adox( 16,(up), %rbx) C 2
+ adox( %rcx, w1) C 3
+ adc $0, w1 C 3
+ bt $0, R32(%r13)
+ adc w1, 24(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o3)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=4),1,`
+L(4): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o4): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3)
+ adox( %rax, w2)
+ mulx( 8,(dp), %rax, w1)
+ adcx( w3, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w3)
+ adcx( w1, %rbx)
+ mulx( 24,(dp), w0, w1)
+ mov dinv, q
+ mulx( %rax, q, w2)
+ adox( 16,(up), %rbx)
+ adcx( w3, w0)
+ adox( 24,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 24(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 32(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o4)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=5),1,`
+L(5): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o5): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w0, w1)
+ adox( %rax, w0)
+ mulx( 8,(dp), %rax, w3)
+ adcx( w1, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w1)
+ adcx( w3, %rbx)
+ adox( 16,(up), %rbx)
+ mulx( 24,(dp), w2, w3)
+ adcx( w1, w2)
+ mulx( 32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov dinv, q
+ mulx( %rax, q, w3)
+ mov w2, 24(up)
+ adox( 32,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 32(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 40(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o5)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=6),1,`
+L(6): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o6): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3)
+ adox( %rax, w2)
+ mulx( 8,(dp), %rax, w1)
+ adcx( w3, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w3)
+ adcx( w1, %rbx)
+ mulx( 24,(dp), w0, w1)
+ adox( 16,(up), %rbx)
+ adcx( w3, w0)
+ adox( 24,(up), w0)
+ mulx( 32,(dp), w2, w3)
+ mov w0, 24(up)
+ adcx( w1, w2)
+ mulx( 40,(dp), w0, w1)
+ adox( 32,(up), w2)
+ adcx( w3, w0)
+ mov dinv, q
+ mulx( %rax, q, w3)
+ mov w2, 32(up)
+ adox( 40,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 40(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 48(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o6)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=7),1,`
+L(7): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp
+ xor %r13, %r13
+ sub dn_param, un
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o7): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w0, w1)
+ adox( %rax, w0)
+ mulx( 8,(dp), %rax, w3)
+ adcx( w1, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w1)
+ adcx( w3, %rbx)
+ mulx( 24,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 16,(up), %rbx)
+ mulx( 32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov w2, 24(up)
+ adox( 32,(up), w0)
+ mulx( 40,(dp), w2, w3)
+ mov w0, 32(up)
+ adcx( w1, w2)
+ mulx( 48,(dp), w0, w1)
+ adox( 40,(up), w2)
+ adcx( w3, w0)
+ mov w2, 40(up)
+ mov %rax, q
+ mulx( dinv, q, w2)
+ adox( 48,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 48(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 56(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o7)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=8),1,`
+L(8): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp
+ xor %r13, %r13
+ sub dn_param, un
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o8): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3)
+ adox( %rax, w2)
+ mulx( 8,(dp), %rax, w1)
+ adcx( w3, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w3)
+ adcx( w1, %rbx)
+ mulx( 24,(dp), w0, w1)
+ adox( 16,(up), %rbx)
+ adcx( w3, w0)
+ mulx( 32,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 24,(up), w0)
+ mov w0, 24(up)
+ mulx( 40,(dp), w0, w1)
+ adox( 32,(up), w2)
+ adcx( w3, w0)
+ mov w2, 32(up)
+ adox( 40,(up), w0)
+ mulx( 48,(dp), w2, w3)
+ mov w0, 40(up)
+ adcx( w1, w2)
+ mulx( 56,(dp), w0, w1)
+ adox( 48,(up), w2)
+ adcx( w3, w0)
+ mov dinv, q
+ mulx( %rax, q, w3)
+ mov w2, 48(up)
+ adox( 56,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 56(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 64(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o8)
+ jmp L(esma)
+')
+
+L(esma):mov %rax, (up)
+ mov %rbx, 8(up)
+ mov %r13, %rax
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+
+ JUMPTABSECT
+ ALIGN(8)
+L(atab):JMPENT( L(f0), L(atab))
+ JMPENT( L(f1), L(atab))
+ JMPENT( L(f2), L(atab))
+ JMPENT( L(f3), L(atab))
+ JMPENT( L(f4), L(atab))
+ JMPENT( L(f5), L(atab))
+ JMPENT( L(f6), L(atab))
+ JMPENT( L(f7), L(atab))
+ JMPENT( L(1), L(atab))
+ JMPENT( L(2), L(atab))
+ JMPENT( L(3), L(atab))
+ JMPENT( L(4), L(atab))
+ JMPENT( L(5), L(atab))
+ JMPENT( L(6), L(atab))
+ JMPENT( L(7), L(atab))
+ JMPENT( L(8), L(atab))
+ TEXT
+EPILOGUE()