From 1a8fa518934840fce207cc6a74256a28477beccd Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Sun, 17 Jan 2021 22:19:23 +0100 Subject: Provide mpn_sbpi1_bdiv_r for bwl/skl/zen. --- mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm | 710 +++++++++++++++++++++++++++++++++++ 1 file changed, 710 insertions(+) create mode 100644 mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm (limited to 'mpn') diff --git a/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm b/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm new file mode 100644 index 000000000..ff3512422 --- /dev/null +++ b/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm @@ -0,0 +1,710 @@ +dnl AMD64 mpn_sbpi1_bdiv_r optimised for Intel Broadwell. + +dnl Copyright 2015, 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 addmul_1 +C AMD K8,K9 n/a n/a +C AMD K10 n/a n/a +C AMD bd1 n/a n/a +C AMD bd2 n/a n/a +C AMD bd3 n/a n/a +C AMD bd4 ? ? +C AMD zn1 ? ? +C AMD zn2 ? ? +C AMD zn3 ? ? 
+C AMD bt1	 n/a	  n/a
+C AMD bt2	 n/a	  n/a
+C Intel P4	 n/a	  n/a
+C Intel PNR	 n/a	  n/a
+C Intel NHM	 n/a	  n/a
+C Intel SBR	 n/a	  n/a
+C Intel IBR	 n/a	  n/a
+C Intel HWL	 1.68	  n/a
+C Intel BWL	 1.51	1.67-1.74
+C Intel SKL	 1.52	1.63-1.71
+C Intel atom	 n/a	  n/a
+C Intel SLM	 n/a	  n/a
+C VIA nano	 n/a	  n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Do overlapped software pipelining.
+C  * Reduce register use, i.e., by combining n_neg and n_save.
+C  * Suppress initial store through up, it's always a zero.
+C  * Streamline up and dp setup.
+C  * When changing this, make sure the code which falls into the inner loops
+C    does not execute too many no-ops (for both PIC and non-PIC).
+
+dnl  mp_limb_t
+dnl  mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un,
+dnl		       mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+
+define(`up',	  `%rdi')
+define(`un',	  `%rsi')
+define(`dp_param',`%rdx')
+define(`dn_param',`%rcx')
+define(`dinv',	  `%r8')
+
+define(`n',	  `%rcx')	C shares %rcx with dn_param
+define(`n_save',  `%rbp')	C dn >> 3, copied to n at each L(outer)
+define(`dp',	  `%r14')	C working copy of dp_param
+define(`n_neg',	  `%rbx')	C -8*(dn-1), byte offset used to rewind up and dp
+define(`q',	  `%rdx')	C shares %rdx with dp_param; implicit mulx operand
+define(`jaddr',	  `%rax')	C target of the indirect jumps
+
+define(`w0',	`%r12')
+define(`w1',	`%r9')
+define(`w2',	`%r10')
+define(`w3',	`%r11')
+
+ifdef(`MAX_SPECIAL',,`
+define(`MAX_SPECIAL', 8)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+
+	lea	L(atab)(%rip), %r10	C r10 = address of the jump table
+
+	cmp	$MAX_SPECIAL, dn_param
+	jbe	L(sma)			C dn <= MAX_SPECIAL (unsigned): specialised code
+
+ifelse(MAX_SPECIAL,8,,`
+forloop(i,eval(MAX_SPECIAL+1),9,`L(i):
+')')
+
+L(gen):	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13		C r13 = 0, carry kept between outer iterations
+
+	sub	dn_param, un		C outer loop count
+
+	lea	-8(,dn_param,8), n_neg	C 8*dn - 8
+	neg	n_neg			C n_neg = -8*(dn-1)
+	mov	dn_param, n_save
+	mov	R32(dn_param), R32(%rax)	C rax = dn, reduced mod 8 below
+	shr	$3, n_save		C loop count
+	and	$7, R32(%rax)		C clear CF and OF as side-effect
+
+ifdef(`PIC',
+`	movslq	(%r10,%rax,4), %rax	C 32-bit table entry, sign-extended
+	lea	(%rax,%r10), jaddr	C plus table address
+',`
+	mov	(%r10,%rax,8), jaddr	C 64-bit table entry used as is
+')
+	mov	(up), q
+	imul	dinv, q			C q = low limb of up[0] * dinv
+	jmp	L(outer)
+
+L(f0):	mulx(	(dp), w2, w3)
+	lea	-1(n), n
+	mulx(	8,(dp), w0, w1)
+	lea	-8(dp), dp
+	adcx(	w3, w0)
+	adox(	(up), w2)
+	lea	-8(up), up
+	jmp	L(b0x)
+
+L(f3):	mulx(	(dp), w0, w1)
+	mulx(	8,(dp), w2, w3)
+	adox(	(up), w0)
+	lea	-48(up), up
+	lea	16(dp), dp
+	jmp	L(b3x)
+
+L(f4):	mulx(	(dp), w2, w3)
+	mulx(	8,(dp), w0, w1)
+	lea	24(dp), dp
+	adox(	(up), w2)
+	lea	-40(up), up
+	adcx(	w3, w0)
+	jmp	L(b4x)
+
+L(f5):	mulx(	(dp), w0, w1)
+	mulx(	8,(dp), w2, w3)
+	lea	32(dp), dp
+	adcx(	w1, w2)
+	adox(	(up), w0)
+	lea	-32(up), up
+	jmp	L(b5x)
+
+L(f6):	mulx(	(dp), w2, w3)
+	mulx(	8,(dp), w0, w1)
+	lea	40(dp), dp
+	adox(	(up), w2)
+	lea	-24(up), up
+	adcx(	w3, w0)
+	jmp	L(b6x)
+
+L(f7):	mulx(	(dp), w0, w1)
+	mulx(	8,(dp), w2, w3)
+	lea	48(dp), dp
+	adcx(	w1, w2)
+	adox(	(up), w0)
+	lea	-16(up), up
+	jmp	L(b7x)
+
+L(f1):	mulx(	(dp), w0, w1)
+	mulx(	8,(dp), w2, w3)
+	adox(	(up), w0)
+	lea	-1(n), n
+	jmp	L(b1x)
+
+L(f2):	mulx(	(dp), w2, w3)
+	mulx(	8,(dp), w0, w1)
+	lea	8(dp), dp
+	adox(	(up), w2)
+	lea	8(up), up
+	adcx(	w3, w0)
+	jmp	L(b2x)
+
+L(end):	adox(	(up), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+	mov	w0, (up)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	8(up,n_neg), q		C Compute next quotient early...
+	mulx(	dinv, q, %r12)		C ...(unused in last iteration)
+	bt	$0, R32(%r13)		C CF = bit 0 of r13 (carry saved earlier)
+	adc	w1, 8(up)
+	setc	R8(%r13)		C keep carry-out in r13
+	dec	un			C clear OF as side-effect
+	jz	L(done)			C un reached zero
+
+	lea	(dp,n_neg), dp		C reset dp to D[]'s beginning
+	lea	8(up,n_neg), up		C point up to U[]'s current beginning
+L(outer):
+	mov	n_save, n
+	test	%eax, %eax		C clear CF and OF
+	jmp	*jaddr			C to the entry point picked from atab above
+
+	ALIGN(16)
+L(top):	adox(	-8,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(up)
+	jrcxz	L(end)			C leave the inner loop when n (rcx) is zero
+L(b2x):	mulx(	8,(dp), w2, w3)
+	adox(	(up), w0)
+	lea	-1(n), n
+	mov	w0, (up)
+L(b1x):	adcx(	w1, w2)
+	mulx(	16,(dp), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(up), w2)
+	mov	w2, 8(up)
+L(b0x):	mulx(	24,(dp), w2, w3)
+	lea	64(dp), dp
+	adcx(	w1, w2)
+	adox(	16,(up), w0)
+	mov	w0, 16(up)
+L(b7x):	mulx(	-32,(dp), w0, w1)
+	adox(	24,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(up)
+L(b6x):	mulx(	-24,(dp), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(up), w0)
+	mov	w0, 32(up)
+L(b5x):	mulx(	-16,(dp), w0, w1)
+	adox(	40,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(up)
+L(b4x):	adox(	48,(up), w0)
+	mulx(	-8,(dp), w2, w3)
+	mov	w0, 48(up)
+L(b3x):	lea	64(up), up
+	adcx(	w1, w2)
+	mulx(	(dp), w0, w1)
+	jmp	L(top)
+
+L(done):mov	%r13, %rax		C return value = carry saved in r13
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(sma):
+ifdef(`PIC',
+`	movslq	28(%r10,dn_param,4), %rax	C table entry number dn+7
+	lea	(%rax,%r10), jaddr
+',`
+	mov	56(%r10,dn_param,8), jaddr	C table entry number dn+7
+')
+	jmp	*jaddr
+
+L(1):	mov	(dp_param), %r10	C r10 = dp[0]
+	xor	R32(%rax), R32(%rax)	C rax = 0
+	mov	(up), %rdx
+	dec	un
+	mov	%rdx, %r9
+L(o1):	mulx(	dinv, %rdx, %r11)	C next quotient
+	lea	8(up), up
+	mulx(	%r10, %rcx, %rdx)	C 0 1
+	add	%r9, %rcx		C 0
+	adc	%rax, %rdx		C 1
+	add	(up), %rdx		C 1
+	setc	R8(%rax)		C 2
+	mov	%rdx, %r9		C 1
+	dec	un
+	jnz	L(o1)
+	mov	%r9, (up)		C store the limb held in r9
+
+	FUNC_EXIT()
+	ret
+
+ifdef(`VER',,`define(`VER',1)')
+L(2):	push	%r12
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	sub	dn_param, un		C loop count
+	mov	(up), q
+	imul	dinv, q
+
+ifelse(VER,0,`
+	xor	R32(%rax), R32(%rax)
+L(o2):	test	%eax, %eax		C clear CF and OF
+	mulx(	(dp), w2, w3)		C 0 1
+	mulx(	8,(dp), %rdx, w1)	C 1 2
+	add	(up), w2		C 0
+	adc	8(up), %rdx		C 1
+	adc	$0, w1			C 2 cannot carry further
+	add	w3, %rdx		C 1
+	mov	%rdx, 8(up)		C 1
+	adc	$0, w1			C 2
+	imul	dinv, q			C
+	bt	$0, R32(%rax)
+	adc	16(up), w1		C 2
+	mov	w1, 16(up)
+	setc	R8(%rax)
+	lea	8(up), up
+	dec	un
+	jnz	L(o2)
+')
+ifelse(VER,1,`
+	push	%rbx
+	push	%r13
+	xor	R32(%r13), R32(%r13)
+	mov	(up), %rax
+	mov	8(up), %rbx
+L(o2):	xor	R32(%rcx), R32(%rcx)
+	mulx(	(dp), w2, w3)		C 0 1
+	mulx(	8,(dp), %rdx, w1)	C 1 2
+	adox(	%rax, w2)		C 0
+	adcx(	w3, %rdx)		C 1
+	adox(	%rbx, %rdx)		C 1
+	adox(	%rcx, w1)		C 2 cannot carry further
+	mov	%rdx, %rax		C 1
+	adc	%rcx, w1		C 2
+	imul	dinv, q			C
+	bt	$0, R32(%r13)
+	adc	16(up), w1		C 2
+	mov	w1, %rbx
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o2)
+
+	mov	%rax, (up)
+	mov	%rbx, 8(up)
+	mov	%r13, %rax
+	pop	%r13
+	pop	%rbx
+')
+ifelse(VER,2,`
+	xor	R32(%rax), R32(%rax)
+	mov	(up), %r10
+	mov	8(up), %r9
+L(o2):	mulx(	(dp), %r12, %r11)
+	mulx(	8,(dp), %rdx, %rcx)
+	add	%r11, %rdx		C 1
+	adc	$0, %rcx		C 2
+	add	%r10, %r12		C 0  add just to produce carry
+	adc	%r9, %rdx		C 1
+	mov	%rdx, %r10		C 1
+	mulx(	dinv, %rdx, %r12)	C next quotient
+	adc	%rax, %rcx		C 2
+	setc	R8(%rax)		C 3
+	mov	16(up), %r9		C 2
+	add	%rcx, %r9		C 2
+	adc	$0, R32(%rax)		C 3
+	lea	8(up), up
+	dec	un
+	jnz	L(o2)
+
+	mov	%r10, (up)
+	mov	%r9, 8(up)
+')
+ifelse(VER,3,`
+	xor	R32(%rax), R32(%rax)
+	mov	(up), %r10
+	mov	8(up), %r9
+L(o2):	mulx(	(dp), %r12, %r11)
+	add	%r10, %r12		C 0  add just to produce carry
+	mulx(	8,(dp), %rdx, %rcx)
+	adc	%r11, %rdx		C 1
+	adc	$0, %rcx		C 2
+	add	%r9, %rdx		C 1
+	mov	%rdx, %r10		C 1
+	mulx(	dinv, %rdx, %r12)	C next quotient
+	adc	%rax, %rcx		C 2
+	setc	R8(%rax)		C 3
+	mov	16(up), %r9		C 2
+	add	%rcx, %r9		C 2
+	adc	$0, R32(%rax)		C 3
+	lea	8(up), up
+	dec	un
+	jnz	L(o2)
+
+	mov	%r10, (up)
+	mov	%r9, 8(up)
+')
+	pop	%r14
+	pop	%r12
+	FUNC_EXIT()
+	ret
+
+ifelse(eval(MAX_SPECIAL>=3),1,`
+L(3):	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13
+	sub	dn_param, un		C outer loop count
+	mov	(up), %rax		C rax = up[0], kept in a register across the loop
+	mov	8(up), %rbx		C rbx = up[1], kept in a register across the loop
+	mov	%rax, q
+	imul	dinv, q
+L(o3):	xor	R32(%rcx), R32(%rcx)	C clear rcx, CF, and OF
+	mulx(	(dp), w0, w1)		C 0 1
+	adox(	%rax, w0)		C 0
+	mulx(	8,(dp), %rax, w3)	C 1 2
+	adcx(	w1, %rax)		C 1
+	adox(	%rbx, %rax)		C 1
+	mulx(	16,(dp), %rbx, w1)	C 2 3
+	mov	dinv, q			C 1
+	mulx(	%rax, q, w0)
+	adcx(	w3, %rbx)		C 2
+	adox(	16,(up), %rbx)		C 2
+	adox(	%rcx, w1)		C 3
+	adc	$0, w1			C 3
+	bt	$0, R32(%r13)
+	adc	w1, 24(up)
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o3)
+	jmp	L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=4),1,`
+L(4):	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13
+	sub	dn_param, un		C outer loop count
+	mov	(up), %rax
+	mov	8(up), %rbx
+	mov	%rax, q
+	imul	dinv, q
+L(o4):	xor	R32(%rcx), R32(%rcx)
+	mulx(	(dp), w2, w3)
+	adox(	%rax, w2)
+	mulx(	8,(dp), %rax, w1)
+	adcx(	w3, %rax)
+	adox(	%rbx, %rax)
+	mulx(	16,(dp), %rbx, w3)
+	adcx(	w1, %rbx)
+	mulx(	24,(dp), w0, w1)
+	mov	dinv, q
+	mulx(	%rax, q, w2)
+	adox(	16,(up), %rbx)
+	adcx(	w3, w0)
+	adox(	24,(up), w0)
+	adox(	%rcx, w1)
+	mov	w0, 24(up)
+	adc	%rcx, w1
+	bt	$0, R32(%r13)
+	adc	w1, 32(up)
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o4)
+	jmp	L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=5),1,`
+L(5):	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13
+	sub	dn_param, un		C outer loop count
+	mov	(up), %rax
+	mov	8(up), %rbx
+	mov	%rax, q
+	imul	dinv, q
+L(o5):	xor	R32(%rcx), R32(%rcx)
+	mulx(	(dp), w0, w1)
+	adox(	%rax, w0)
+	mulx(	8,(dp), %rax, w3)
+	adcx(	w1, %rax)
+	adox(	%rbx, %rax)
+	mulx(	16,(dp), %rbx, w1)
+	adcx(	w3, %rbx)
+	adox(	16,(up), %rbx)
+	mulx(	24,(dp), w2, w3)
+	adcx(	w1, w2)
+	mulx(	32,(dp), w0, w1)
+	adox(	24,(up), w2)
+	adcx(	w3, w0)
+	mov	dinv, q
+	mulx(	%rax, q, w3)
+	mov	w2, 24(up)
+	adox(	32,(up), w0)
+	adox(	%rcx, w1)
+	mov	w0, 32(up)
+	adc	%rcx, w1
+	bt	$0, R32(%r13)
+	adc	w1, 40(up)
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o5)
+	jmp	L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=6),1,`
+L(6):	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13
+	sub	dn_param, un		C outer loop count
+	mov	(up), %rax
+	mov	8(up), %rbx
+	mov	%rax, q
+	imul	dinv, q
+L(o6):	xor	R32(%rcx), R32(%rcx)
+	mulx(	(dp), w2, w3)
+	adox(	%rax, w2)
+	mulx(	8,(dp), %rax, w1)
+	adcx(	w3, %rax)
+	adox(	%rbx, %rax)
+	mulx(	16,(dp), %rbx, w3)
+	adcx(	w1, %rbx)
+	mulx(	24,(dp), w0, w1)
+	adox(	16,(up), %rbx)
+	adcx(	w3, w0)
+	adox(	24,(up), w0)
+	mulx(	32,(dp), w2, w3)
+	mov	w0, 24(up)
+	adcx(	w1, w2)
+	mulx(	40,(dp), w0, w1)
+	adox(	32,(up), w2)
+	adcx(	w3, w0)
+	mov	dinv, q
+	mulx(	%rax, q, w3)
+	mov	w2, 32(up)
+	adox(	40,(up), w0)
+	adox(	%rcx, w1)
+	mov	w0, 40(up)
+	adc	%rcx, w1
+	bt	$0, R32(%r13)
+	adc	w1, 48(up)
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o6)
+	jmp	L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=7),1,`
+L(7):	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp
+	xor	%r13, %r13
+	sub	dn_param, un
+	mov	(up), %rax
+	mov	8(up), %rbx
+	mov	%rax, q
+	imul	dinv, q
+L(o7):	xor	R32(%rcx), R32(%rcx)
+	mulx(	(dp), w0, w1)
+	adox(	%rax, w0)
+	mulx(	8,(dp), %rax, w3)
+	adcx(	w1, %rax)
+	adox(	%rbx, %rax)
+	mulx(	16,(dp), %rbx, w1)
+	adcx(	w3, %rbx)
+	mulx(	24,(dp), w2, w3)
+	adcx(	w1, w2)
+	adox(	16,(up), %rbx)
+	mulx(	32,(dp), w0, w1)
+	adox(	24,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(up)
+	adox(	32,(up), w0)
+	mulx(	40,(dp), w2, w3)
+	mov	w0, 32(up)
+	adcx(	w1, w2)
+	mulx(	48,(dp), w0, w1)
+	adox(	40,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(up)
+	mov	%rax, q
+	mulx(	dinv, q, w2)
+	adox(	48,(up), w0)
+	adox(	%rcx, w1)
+	mov	w0, 48(up)
+	adc	%rcx, w1
+	bt	$0, R32(%r13)
+	adc	w1, 56(up)
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o7)
+	jmp	L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=8),1,`
+L(8):	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp
+	xor	%r13, %r13
+	sub	dn_param, un
+	mov	(up), %rax
+	mov	8(up), %rbx
+	mov	%rax, q
+	imul	dinv, q
+L(o8):	xor	R32(%rcx), R32(%rcx)
+	mulx(	(dp), w2, w3)
+	adox(	%rax, w2)
+	mulx(	8,(dp), %rax, w1)
+	adcx(	w3, %rax)
+	adox(	%rbx, %rax)
+	mulx(	16,(dp), %rbx, w3)
+	adcx(	w1, %rbx)
+	mulx(	24,(dp), w0, w1)
+	adox(	16,(up), %rbx)
+	adcx(	w3, w0)
+	mulx(	32,(dp), w2, w3)
+	adcx(	w1, w2)
+	adox(	24,(up), w0)
+	mov	w0, 24(up)
+	mulx(	40,(dp), w0, w1)
+	adox(	32,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, 32(up)
+	adox(	40,(up), w0)
+	mulx(	48,(dp), w2, w3)
+	mov	w0, 40(up)
+	adcx(	w1, w2)
+	mulx(	56,(dp), w0, w1)
+	adox(	48,(up), w2)
+	adcx(	w3, w0)
+	mov	dinv, q
+	mulx(	%rax, q, w3)
+	mov	w2, 48(up)
+	adox(	56,(up), w0)
+	adox(	%rcx, w1)
+	mov	w0, 56(up)
+	adc	%rcx, w1
+	bt	$0, R32(%r13)
+	adc	w1, 64(up)
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o8)
+	jmp	L(esma)
+')
+
+L(esma):mov	%rax, (up)		C store the two limbs held in rax and rbx
+	mov	%rbx, 8(up)
+	mov	%r13, %rax		C return value = carry saved in r13
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+
+	JUMPTABSECT
+	ALIGN(8)
+L(atab):JMPENT(	L(f0), L(atab))		C entries 0..7: indexed by dn mod 8 in L(gen)
+	JMPENT(	L(f1), L(atab))
+	JMPENT(	L(f2), L(atab))
+	JMPENT(	L(f3), L(atab))
+	JMPENT(	L(f4), L(atab))
+	JMPENT(	L(f5), L(atab))
+	JMPENT(	L(f6), L(atab))
+	JMPENT(	L(f7), L(atab))
+	JMPENT(	L(1), L(atab))		C entries 8..15: indexed by dn+7 in L(sma)
+	JMPENT(	L(2), L(atab))
+	JMPENT(	L(3), L(atab))
+	JMPENT(	L(4), L(atab))
+	JMPENT(	L(5), L(atab))
+	JMPENT(	L(6), L(atab))
+	JMPENT(	L(7), L(atab))
+	JMPENT(	L(8), L(atab))
+	TEXT
+EPILOGUE()
-- cgit v1.2.1