diff options
author | Torbjorn Granlund <tg@gmplib.org> | 2019-11-29 00:53:11 +0100 |
---|---|---|
committer | Torbjorn Granlund <tg@gmplib.org> | 2019-11-29 00:53:11 +0100 |
commit | 175570b422747d6a7e7ec141e5c04197e0c7d876 (patch) | |
tree | 47c2f0d2c00835224d3d0abfa636e6b8806689e5 | |
parent | 3c9499b33207fab15aa61f90c8537f4a1514a416 (diff) | |
download | gmp-175570b422747d6a7e7ec141e5c04197e0c7d876.tar.gz |
Remove all gcd_1.asm files.
-rw-r--r-- | mpn/alpha/ev67/gcd_1.asm | 145 | ||||
-rw-r--r-- | mpn/arm/v5/gcd_1.asm | 123 | ||||
-rw-r--r-- | mpn/arm/v6t2/gcd_1.asm | 118 | ||||
-rw-r--r-- | mpn/arm64/gcd_1.asm | 125 | ||||
-rw-r--r-- | mpn/ia64/gcd_1.asm | 238 | ||||
-rw-r--r-- | mpn/powerpc64/mode64/gcd_1.asm | 125 | ||||
-rw-r--r-- | mpn/powerpc64/mode64/p7/gcd_1.asm | 110 | ||||
-rw-r--r-- | mpn/powerpc64/mode64/p9/gcd_1.asm | 101 | ||||
-rw-r--r-- | mpn/sparc64/gcd_1.asm | 135 | ||||
-rw-r--r-- | mpn/x86/k6/gcd_1.asm | 359 | ||||
-rw-r--r-- | mpn/x86/k7/gcd_1.asm | 193 | ||||
-rw-r--r-- | mpn/x86/p6/gcd_1.asm | 161 | ||||
-rw-r--r-- | mpn/x86_64/bd1/gcd_1.asm | 37 | ||||
-rw-r--r-- | mpn/x86_64/bd2/gcd_1.asm | 164 | ||||
-rw-r--r-- | mpn/x86_64/bt2/gcd_1.asm | 37 | ||||
-rw-r--r-- | mpn/x86_64/core2/gcd_1.asm | 151 | ||||
-rw-r--r-- | mpn/x86_64/gcd_1.asm | 170 | ||||
-rw-r--r-- | mpn/x86_64/k10/gcd_1.asm | 37 | ||||
-rw-r--r-- | mpn/x86_64/nano/gcd_1.asm | 37 | ||||
-rw-r--r-- | mpn/x86_64/zen/gcd_1.asm | 37 |
20 files changed, 0 insertions, 2603 deletions
diff --git a/mpn/alpha/ev67/gcd_1.asm b/mpn/alpha/ev67/gcd_1.asm deleted file mode 100644 index 55fa7d367..000000000 --- a/mpn/alpha/ev67/gcd_1.asm +++ /dev/null @@ -1,145 +0,0 @@ -dnl Alpha ev67 mpn_gcd_1 -- Nx1 greatest common divisor. - -dnl Copyright 2003, 2004 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C ev67: 3.4 cycles/bitpair for 1x1 part - - -C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y); -C -C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and -C strip trailing zeros from abs(x-y) to maintain x and y both odd. -C -C The trailing zeros are calculated from just x-y, since in twos-complement -C there's the same number of trailing zeros on d or -d. This means the cttz -C runs in parallel with abs(x-y). -C -C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit -C operands with this algorithm gives the measured 3.4 c/l. -C -C The slottings shown are for SVR4 style systems, Unicos differs in the -C initial gp setup and the LEA. -C -C Enhancement: -C -C On the jsr, !lituse_jsr! (when available) would allow the linker to relax -C it to a bsr, but probably only in a static binary. Plain "jsr foo" gives -C the right object code for relaxation, and ought to be available -C everywhere, but we prefer to schedule the GOT ldq (LEA) back earlier, for -C the usual case of running in a shared library. -C -C bsr could perhaps be used explicitly anyway. We should be able to assume -C modexact is in the same module as us (ie. shared library or mainline). -C Would there be any worries about the size of the displacement? Could -C always put modexact and gcd_1 in the same .o to be certain. - -ASM_START() -PROLOGUE(mpn_gcd_1, gp) - - C r16 xp - C r17 size - C r18 y - - C ldah C l - C lda C u - - ldq r0, 0(r16) C L x = xp[0] - lda r30, -32(r30) C u alloc stack - - LEA( r27, mpn_modexact_1c_odd) C L modexact addr, ldq (gp) - stq r10, 16(r30) C L save r10 - cttz r18, r10 C U0 y twos - cmpeq r17, 1, r5 C u test size==1 - - stq r9, 8(r30) C L save r9 - clr r19 C u zero c for modexact - unop - unop - - cttz r0, r6 C U0 x twos - stq r26, 0(r30) C L save ra - - srl r18, r10, r18 C U y odd - - mov r18, r9 C l hold y across call - - cmpult r6, r10, r2 C u test x_twos < y_twos - - cmovne r2, r6, r10 C l common_twos = min(x_twos,y_twos) - bne r5, L(one) C U no modexact if size==1 - jsr r26, (r27), mpn_modexact_1c_odd C L0 - - LDGP( r29, 0(r26)) C u,l ldah,lda - cttz r0, r6 C U0 new x twos - ldq r26, 0(r30) C L restore ra - -L(one): - mov r9, r1 C u y - ldq r9, 8(r30) C L restore r9 - mov r10, r2 C u common twos - ldq r10, 16(r30) C L restore r10 - - lda r30, 32(r30) C l free stack - beq r0, L(done) C U return y if x%y==0 - - srl r0, r6, r0 C U x odd - unop - - ALIGN(16) -L(top): - C r0 x - C r1 y - C r2 common twos, for use at end - - subq r0, r1, r7 C l0 d = x - y - cmpult r0, r1, r16 C u0 test x >= y - - subq r1, r0, r4 C l0 new_x = y - x - cttz r7, r8 C U0 d twos - - cmoveq r16, r7, r4 C l0 new_x = d if x>=y - cmovne r16, r0, r1 C u0 y = x if x<y - unop C l \ force cmoveq into l0 - unop C u / - - C C cmoveq2 L0, cmovne2 U0 - - srl r4, r8, r0 C U0 x = new_x >> twos - bne r7, L(top) C U1 stop when d==0 - - -L(done): - sll r1, r2, r0 C U0 return y << common_twos - ret r31, (r26), 1 C L0 - -EPILOGUE() -ASM_END() diff --git a/mpn/arm/v5/gcd_1.asm b/mpn/arm/v5/gcd_1.asm deleted file mode 100644 index 7897544b9..000000000 --- a/mpn/arm/v5/gcd_1.asm +++ /dev/null @@ -1,123 +0,0 @@ -dnl ARM v5 mpn_gcd_1. - -dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjörn -dnl Granlund. - -dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/bit (approx) -C StrongARM - -C XScale ? -C Cortex-A5 6.45 -C Cortex-A7 6.41 -C Cortex-A8 5.0 -C Cortex-A9 5.9 -C Cortex-A15 4.40 -C Cortex-A17 5.68 -C Cortex-A53 4.37 -C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1 - -C TODO -C * Optimise inner-loop better. - -C Threshold of when to call bmod when U is one limb. Should be about -C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit). -define(`BMOD_THRES_LOG2', 6) - -C INPUT PARAMETERS -define(`up', `r0') -define(`n', `r1') -define(`v0', `r2') - -ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',, - `define(`BMOD_1_TO_MOD_1_THRESHOLD',0xffffffff)') - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_gcd_1) - push {r4, r7, lr} - ldr r3, [up] C U low limb - - orr r3, r3, v0 - rsb r4, r3, #0 - and r4, r4, r3 - clz r4, r4 C min(ctz(u0),ctz(v0)) - rsb r4, r4, #31 - - rsb r12, v0, #0 - and r12, r12, v0 - clz r12, r12 - rsb r12, r12, #31 - mov v0, v0, lsr r12 - - mov r7, v0 - - cmp n, #1 - bne L(nby1) - -C Both U and V are single limbs, reduce with bmod if u0 >> v0. - ldr r3, [up] - cmp v0, r3, lsr #BMOD_THRES_LOG2 - bhi L(red1) - -L(bmod):mov r3, #0 C carry argument - bl mpn_modexact_1c_odd - b L(red0) - -L(nby1):cmp n, #BMOD_1_TO_MOD_1_THRESHOLD - blo L(bmod) - - bl mpn_mod_1 - -L(red0):mov r3, r0 -L(red1):rsbs r12, r3, #0 - and r12, r12, r3 - clz r12, r12 - rsb r12, r12, #31 - bne L(mid) - b L(end) - - ALIGN(8) -L(top): rsb r12, r12, #31 - movcc r3, r1 C if x-y < 0 - movcc r7, r0 C use x,y-x -L(mid): mov r3, r3, lsr r12 C - mov r0, r3 C - sub r1, r7, r3 C - rsbs r3, r7, r3 C - and r12, r1, r3 C - clz r12, r12 C - bne L(top) C - -L(end): mov r0, r7, lsl r4 - pop {r4, r7, pc} -EPILOGUE() diff --git a/mpn/arm/v6t2/gcd_1.asm b/mpn/arm/v6t2/gcd_1.asm deleted file mode 100644 index 120a955f3..000000000 --- a/mpn/arm/v6t2/gcd_1.asm +++ /dev/null @@ -1,118 +0,0 @@ -dnl ARM v6t2 mpn_gcd_1. - -dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjörn -dnl Granlund. - -dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/bit (approx) -C StrongARM - -C XScale - -C Cortex-A5 5.75 -C Cortex-A7 6.38 -C Cortex-A8 5.0 -C Cortex-A9 5.3 -C Cortex-A15 2.92 -C Cortex-A17 5.63 -C Cortex-A53 4.25 -C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1 - -C TODO -C * Optimise inner-loop better. -C * Push saving/restoring of callee-user regs into call code - -C Threshold of when to call bmod when U is one limb. Should be about -C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit). -define(`BMOD_THRES_LOG2', 7) - -C INPUT PARAMETERS -define(`up', `r0') -define(`n', `r1') -define(`v0', `r2') - -ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',, - `define(`BMOD_1_TO_MOD_1_THRESHOLD',0xffffffff)') - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_gcd_1) - push {r4, r7, lr} - ldr r3, [up] C U low limb - - orr r3, r3, v0 - rbit r4, r3 - clz r4, r4 C min(ctz(u0),ctz(v0)) - - rbit r12, v0 - clz r12, r12 - mov v0, v0, lsr r12 - - mov r7, v0 - - cmp n, #1 - bne L(nby1) - -C Both U and V are single limbs, reduce with bmod if u0 >> v0. - ldr r3, [up] - cmp v0, r3, lsr #BMOD_THRES_LOG2 - bhi L(red1) - -L(bmod):mov r3, #0 C carry argument - bl mpn_modexact_1c_odd - b L(red0) - -L(nby1):cmp n, #BMOD_1_TO_MOD_1_THRESHOLD - blo L(bmod) - - bl mpn_mod_1 - -L(red0):mov r3, r0 -L(red1):cmp r3, #0 - rbit r12, r3 - clz r12, r12 - bne L(mid) - b L(end) - - ALIGN(8) -L(top): movcs r3, r1 C if x-y < 0 - movcs r7, r0 C use x,y-x -L(mid): mov r3, r3, lsr r12 C - mov r0, r3 C - subs r1, r7, r3 C - rsb r3, r7, r3 C - rbit r12, r1 - clz r12, r12 C - bne L(top) C - -L(end): mov r0, r7, lsl r4 - pop {r4, r7, pc} -EPILOGUE() diff --git a/mpn/arm64/gcd_1.asm b/mpn/arm64/gcd_1.asm deleted file mode 100644 index 55513b31d..000000000 --- a/mpn/arm64/gcd_1.asm +++ /dev/null @@ -1,125 +0,0 @@ -dnl ARM v8a mpn_gcd_1. - -dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjorn -dnl Granlund. - -dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -changecom(blah) - -C cycles/bit (approx) -C Cortex-A53 ? -C Cortex-A57 ? - -C TODO -C * Optimise inner-loop better. -C * Push saving/restoring of callee-user regs into call code - -C Threshold of when to call bmod when U is one limb. Should be about -C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit). -define(`BMOD_THRES_LOG2', 7) - -C INPUT PARAMETERS -define(`up', `x0') -define(`n', `x1') -define(`v0', `x2') - -ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',, - `define(`BMOD_1_TO_MOD_1_THRESHOLD',30)') - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_gcd_1) - stp x29, x30, [sp,#-32]! - ldr x3, [up] C U low limb - stp x19, x20, [sp,#16] - - orr x3, x3, v0 - rbit x4, x3 - clz x20, x4 C min(ctz(u0),ctz(v0)) - - rbit x12, v0 - clz x12, x12 - lsr v0, v0, x12 - - mov x19, v0 - - cmp n, #1 - b.ne L(nby1) - -C Both U and V are single limbs, reduce with bmod if u0 >> v0. - ldr x3, [up] - cmp v0, x3, lsr #BMOD_THRES_LOG2 - b.hi L(red1) - -L(bmod):mov x3, #0 C carry argument - bl mpn_modexact_1c_odd - b L(red0) - -L(nby1):cmp n, #BMOD_1_TO_MOD_1_THRESHOLD - b.lo L(bmod) - - bl mpn_mod_1 - -L(red0):mov x3, x0 -L(red1):cmp x3, #0 - rbit x12, x3 - clz x12, x12 - b.ne L(mid) - b L(end) - - ALIGN(8) -L(top): -ifelse(1,1,` -C This shorter variant makes full use of armv8 insns - csneg x3, x1, x1, cs C if x-y < 0 - csel x19, x4, x19, cs C use x,y-x -L(mid): lsr x4, x3, x12 C - subs x1, x19, x4 C -',` -C This variant is akin to the 32-bit v6t2 code - csel x3, x1, x3, cs C if x-y < 0 - csel x19, x0, x19, cs C use x,y-x -L(mid): lsr x3, x3, x12 C - mov x0, x3 C - subs x1, x19, x3 C - sub x3, x3, x19 C -') - rbit x12, x1 - clz x12, x12 C - b.ne L(top) C - -L(end): lsl x0, x19, x20 - ldp x19, x20, [sp,#16] - ldp x29, x30, [sp],#32 - ret -EPILOGUE() diff --git a/mpn/ia64/gcd_1.asm b/mpn/ia64/gcd_1.asm deleted file mode 100644 index 3afabd706..000000000 --- a/mpn/ia64/gcd_1.asm +++ /dev/null @@ -1,238 +0,0 @@ -dnl Itanium-2 mpn_gcd_1 -- mpn by 1 gcd. - -dnl Contributed to the GNU project by Kevin Ryde, innerloop by Torbjorn -dnl Granlund. - -dnl Copyright 2002-2005, 2012, 2013, 2015 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/bitpair (1x1 gcd) -C Itanium: ? -C Itanium 2: 5.1 - - -C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y); -C -C The entry sequence is designed to expect xsize>1 and hence a modexact -C call. This ought to be more common than a 1x1 operation. Our critical -C path is thus stripping factors of 2 from y, calling modexact, then -C stripping factors of 2 from the x remainder returned. -C -C The common factors of 2 between x and y must be determined using the -C original x, not the remainder from the modexact. This is done with -C x_orig which is xp[0]. There's plenty of time to do this while the rest -C of the modexact etc is happening. -C -C It's possible xp[0] is zero. In this case the trailing zeros calculation -C popc((x-1)&~x) gives 63, and that's clearly no less than what y will -C have, making min(x_twos,y_twos) == y_twos. -C -C The main loop consists of transforming x,y to abs(x-y),min(x,y), and then -C stripping factors of 2 from abs(x-y). Those factors of two are -C determined from just y-x, without the abs(), since there's the same -C number of trailing zeros on n or -n in twos complement. That makes the -C dependent chain 8 cycles deep. -C -C The selection of x-y versus y-x for abs(x-y), and the selection of the -C minimum of x and y, is done in parallel with the critical path. -C -C The algorithm takes about 0.68 iterations per bit (two N bit operands) on -C average, hence the final 5.8 cycles/bitpair. -C -C Not done: -C -C An alternate algorithm which didn't strip all twos, but instead applied -C tbit and predicated extr on x, and then y, was attempted. The loop was 6 -C cycles, but the algorithm is an average 1.25 iterations per bitpair for a -C total 7.25 c/bp, which is slower than the current approach. -C -C Alternatives: -C -C Perhaps we could do something tricky by extracting a few high bits and a -C few low bits from the operands, and looking up a table which would give a -C set of predicates to control some shifts or subtracts or whatever. That -C could knock off multiple bits per iteration. -C -C The right shifts are a bit of a bottleneck (shr at 2 or 3 cycles, or extr -C only going down I0), perhaps it'd be possible to shift left instead, -C using add. That would mean keeping track of the lowest not-yet-zeroed -C bit, using some sort of mask. -C -C TODO: -C * Once mod_1_N exists in assembly for Itanium, add conditional calls. -C * Call bmod_1 even for n=1 when up[0] >> v0 (like other gcd_1 impls). -C * Probably avoid popcnt also outside of loop, instead use ctz_table. - -ASM_START() - .explicit C What does this mean? - -C HP's assembler requires these declarations for importing mpn_modexact_1c_odd - .global mpn_modexact_1c_odd - .type mpn_modexact_1c_odd,@function - -C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. - -deflit(MAXSHIFT, 7) -deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) - -C .section ".rodata" - .rodata - ALIGN(m4_lshift(1,MAXSHIFT)) C align table to allow using dep -ctz_table: - data1 MAXSHIFT -forloop(i,1,MASK, -` data1 m4_count_trailing_zeros(i) -') - -PROLOGUE(mpn_gcd_1) - - C r32 xp - C r33 xsize - C r34 y - -define(x, r8) -define(xp_orig, r32) -define(xsize, r33) -define(y, r34) define(inputs, 3) -define(save_rp, r35) -define(save_pfs, r36) -define(x_orig, r37) -define(x_orig_one, r38) -define(y_twos, r39) define(locals, 5) -define(out_xp, r40) -define(out_xsize, r41) -define(out_divisor, r42) -define(out_carry, r43) define(outputs, 4) - - .prologue - {.mmi; -ifdef(`HAVE_ABI_32', -` addp4 r9 = 0, xp_orig define(xp,r9)', C M0 -` define(xp,xp_orig)') - .save ar.pfs, save_pfs - alloc save_pfs = ar.pfs, inputs, locals, outputs, 0 C M2 - .save rp, save_rp - mov save_rp = b0 C I0 -}{.mbb; .body - add r10 = -1, y C M3 y-1 - nop.b 0 C B0 - nop.b 0 C B1 - ;; - -}{.mmi; ld8 x = [xp] C M0 x = xp[0] if no modexact - ld8 x_orig = [xp] C M1 orig x for common twos - cmp.ne p6,p0 = 1, xsize C I0 -}{.mmi; andcm y_twos = r10, y C M2 (y-1)&~y - mov out_xp = xp_orig C M3 - mov out_xsize = xsize C I1 - ;; -}{.mmi; mov out_carry = 0 C M0 - nop.m 0 C M1 - popcnt y_twos = y_twos C I0 y twos - ;; -}{.mmi; add x_orig_one = -1, x_orig C M0 orig x-1 - nop.m 0 C M1 - shr.u out_divisor = y, y_twos C I0 y without twos -}{.mib; nop.m 0 C M2 - shr.u y = y, y_twos C I1 y without twos - (p6) br.call.sptk.many b0 = mpn_modexact_1c_odd C if xsize>1 - ;; -} - C modexact can leave x==0 - {.mmi; cmp.eq p6,p0 = 0, x C M0 if {xp,xsize} % y == 0 - andcm x_orig = x_orig_one, x_orig C M1 orig (x-1)&~x - add r9 = -1, x C I0 x-1 - ;; -}{.mmi; andcm r9 = r9, x C M0 (x-1)&~x - nop.m 0 C M1 - mov b0 = save_rp C I0 - ;; -}{.mii; nop.m 0 C M0 - popcnt x_orig = x_orig C I0 orig x twos - popcnt r9 = r9 C I0 x twos - ;; -}{.mmi; cmp.lt p7,p0 = x_orig, y_twos C M0 orig x_twos < y_twos - addl r22 = @ltoff(ctz_table), r1 - shr.u x = x, r9 C I0 x odd - ;; -}{.mib; - (p7) mov y_twos = x_orig C M0 common twos - add r10 = -1, y C I0 y-1 - (p6) br.dpnt.few L(done_y) C B0 x%y==0 then result y - ;; -} - mov r25 = m4_lshift(MASK, MAXSHIFT) - ld8 r22 = [r22] - br L(ent) - ;; - - ALIGN(32) -L(top): - .pred.rel "mutex", p6,p7 - {.mmi; (p7) mov y = x - (p6) sub x = x, y - dep r21 = r19, r22, 0, MAXSHIFT C concat(table,lowbits) -}{.mmi; and r20 = MASK, r19 - (p7) mov x = r19 - nop 0 - ;; -} -L(mid): -{.mmb; ld1 r16 = [r21] - cmp.eq p10,p0 = 0, r20 - (p10) br.spnt.few.clr L(shift_alot) - ;; -}{.mmi; nop 0 - nop 0 - shr.u x = x, r16 - ;; -} -L(ent): - {.mmi; sub r19 = y, x - cmp.gtu p6,p7 = x, y - cmp.ne p8,p0 = x, y -}{.mmb; nop 0 - nop 0 - (p8) br.sptk.few.clr L(top) -} - -L(done_y): C result is y - mov ar.pfs = save_pfs C I0 - shl r8 = y, y_twos C I common factors of 2 - br.ret.sptk.many b0 - -L(shift_alot): - and r20 = x, r25 - shr.u x = x, MAXSHIFT - ;; - dep r21 = x, r22, 0, MAXSHIFT - br L(mid) -EPILOGUE() diff --git a/mpn/powerpc64/mode64/gcd_1.asm b/mpn/powerpc64/mode64/gcd_1.asm deleted file mode 100644 index a20734416..000000000 --- a/mpn/powerpc64/mode64/gcd_1.asm +++ /dev/null @@ -1,125 +0,0 @@ -dnl PowerPC-64 mpn_gcd_1. - -dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/bit (approx) -C POWER3/PPC630 ? -C POWER4/PPC970 8.5 -C POWER5 ? -C POWER6 10.1 -C POWER7 9.4 -C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 - -C INPUT PARAMETERS -define(`up', `r3') -define(`n', `r4') -define(`v0', `r5') - -ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',, - `define(`BMOD_1_TO_MOD_1_THRESHOLD',30)') - -EXTERN_FUNC(mpn_mod_1) -EXTERN_FUNC(mpn_modexact_1c_odd) - -ASM_START() -PROLOGUE(mpn_gcd_1,toc) - mflr r0 - std r30, -16(r1) - std r31, -8(r1) - std r0, 16(r1) - stdu r1, -128(r1) - - ld r7, 0(up) C U low limb - or r0, r5, r7 C x | y - - neg r6, r0 - and r6, r6, r0 - cntlzd r31, r6 C common twos - subfic r31, r31, 63 - - neg r6, r5 - and r6, r6, r5 - cntlzd r8, r6 - subfic r8, r8, 63 - srd r5, r5, r8 - mr r30, r5 C v0 saved - - cmpdi r4, BMOD_1_TO_MOD_1_THRESHOLD - blt L(bmod) - CALL( mpn_mod_1) - b L(reduced) -L(bmod): - li r6, 0 - CALL( mpn_modexact_1c_odd) -L(reduced): - -define(`mask', `r0')dnl -define(`a1', `r4')dnl -define(`a2', `r5')dnl -define(`d1', `r6')dnl -define(`d2', `r7')dnl -define(`cnt', `r9')dnl - - neg. r6, r3 - and r6, r6, r3 - cntlzd cnt, r6 - subfic cnt, cnt, 63 - li r12, 63 - bne L(mid) - b L(end) - - ALIGN(16) -L(top): - and a1, r10, mask C d - a - andc a2, r11, mask C a - d - and d1, r3, mask C a - andc d2, r30, mask C d - or r3, a1, a2 C new a - subf cnt, cnt, r12 - or r30, d1, d2 C new d -L(mid): srd r3, r3, cnt - sub. r10, r30, r3 C r10 = d - a - subc r11, r3, r30 C r11 = a - d - neg r8, r10 - and r8, r8, r10 - subfe mask, mask, mask - cntlzd cnt, r8 - bne L(top) - -L(end): sld r3, r30, r31 - - addi r1, r1, 128 - ld r0, 16(r1) - ld r30, -16(r1) - ld r31, -8(r1) - mtlr r0 - blr -EPILOGUE() diff --git a/mpn/powerpc64/mode64/p7/gcd_1.asm b/mpn/powerpc64/mode64/p7/gcd_1.asm deleted file mode 100644 index 47cb40bdc..000000000 --- a/mpn/powerpc64/mode64/p7/gcd_1.asm +++ /dev/null @@ -1,110 +0,0 @@ -dnl PowerPC-64 mpn_gcd_1. - -dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/bit (approx) -C POWER3/PPC630 - -C POWER4/PPC970 - -C POWER5 - -C POWER6 - -C POWER7 7.6 -C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 - -C INPUT PARAMETERS -define(`up', `r3') -define(`n', `r4') -define(`v0', `r5') - -EXTERN_FUNC(mpn_mod_1) -EXTERN_FUNC(mpn_modexact_1c_odd) - -ASM_START() -PROLOGUE(mpn_gcd_1,toc) - mflr r0 - std r30, -16(r1) - std r31, -8(r1) - std r0, 16(r1) - stdu r1, -128(r1) - - ld r7, 0(up) C U low limb - or r0, r5, r7 C x | y - - neg r6, r0 - and r6, r6, r0 - cntlzd r31, r6 C common twos - subfic r31, r31, 63 - - neg r6, r5 - and r6, r6, r5 - cntlzd r8, r6 - subfic r8, r8, 63 - srd r5, r5, r8 - mr r30, r5 C v0 saved - - cmpdi r4, BMOD_1_TO_MOD_1_THRESHOLD - blt L(bmod) - CALL( mpn_mod_1) - b L(reduced) -L(bmod): - li r6, 0 - CALL( mpn_modexact_1c_odd) -L(reduced): - -define(`cnt', `r9')dnl - - neg. r6, r3 - and r6, r6, r3 - cntlzd cnt, r6 - li r12, 63 - bne L(mid) - b L(end) - - ALIGN(16) -L(top): isel r30, r3, r30, 29 C y = min(x,y) - isel r3, r10, r11, 29 C x = |y - x| -L(mid): subf cnt, cnt, r12 C cnt = 63-cnt - srd r3, r3, cnt - subf r10, r3, r30 C r10 = y - x - subf r11, r30, r3 C r11 = x - y - cmpld cr7, r30, r3 - and r8, r11, r10 C isolate lsb - cntlzd cnt, r8 - bne cr7, L(top) - -L(end): sld r3, r30, r31 - - addi r1, r1, 128 - ld r0, 16(r1) - ld r30, -16(r1) - ld r31, -8(r1) - mtlr r0 - blr -EPILOGUE() diff --git a/mpn/powerpc64/mode64/p9/gcd_1.asm b/mpn/powerpc64/mode64/p9/gcd_1.asm deleted file mode 100644 index 286e48dc5..000000000 --- a/mpn/powerpc64/mode64/p9/gcd_1.asm +++ /dev/null @@ -1,101 +0,0 @@ -dnl PowerPC-64 mpn_gcd_1. - -dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation, -dnl Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -C cycles/bit (approx) -C POWER3/PPC630 - -C POWER4/PPC970 - -C POWER5 - -C POWER6 - -C POWER7 - -C POWER8 - -C POWER9 5.75 -C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 - -C INPUT PARAMETERS -define(`up', `r3') -define(`n', `r4') -define(`v0', `r5') - -EXTERN_FUNC(mpn_mod_1) -EXTERN_FUNC(mpn_modexact_1c_odd) - -ASM_START() -PROLOGUE(mpn_gcd_1,toc) - mflr r0 - std r30, -16(r1) - std r31, -8(r1) - std r0, 16(r1) - stdu r1, -128(r1) - - ld r7, 0(up) C U low limb - or r0, r5, r7 C x | y - cnttzd r31, r0 C common twos - cnttzd r8, r5 - srd r5, r5, r8 - mr r30, r5 C v0 saved - - cmpdi r4, BMOD_1_TO_MOD_1_THRESHOLD - blt L(bmod) - CALL( mpn_mod_1) - b L(reduced) -L(bmod): - li r6, 0 - CALL( mpn_modexact_1c_odd) -L(reduced): - -define(`cnt', `r9')dnl - - cmpdi r3, 0 - cnttzd cnt, r3 - bne L(mid) - b L(end) - - ALIGN(16) -L(top): isel r30, r3, r30, 29 C y = min(x,y) - isel r3, r10, r11, 29 C x = |y - x| -L(mid): srd r3, r3, cnt - subf r10, r3, r30 C r10 = y - x - subf r11, r30, r3 C r11 = x - y - cmpld cr7, r30, r3 - cnttzd cnt, r10 - bne cr7, L(top) - -L(end): sld r3, r30, r31 - - addi r1, r1, 128 - ld r0, 16(r1) - ld r30, -16(r1) - ld r31, -8(r1) - mtlr r0 - blr -EPILOGUE() diff --git a/mpn/sparc64/gcd_1.asm b/mpn/sparc64/gcd_1.asm deleted file mode 100644 index e4d8de6a2..000000000 --- a/mpn/sparc64/gcd_1.asm +++ /dev/null @@ -1,135 +0,0 @@ -dnl SPARC64 mpn_gcd_1. - -dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for SPARC by Torbjörn -dnl Granlund. - -dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/bit (approx) -C UltraSPARC 1&2: 5.1 -C UltraSPARC 3: 5.0 -C UltraSPARC T1: 11.4 -C UltraSPARC T3: 10 -C UltraSPARC T4: 6 -C Numbers measured with: speed -CD -s32-64 -t32 mpn_gcd_1 - -C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. - -deflit(MAXSHIFT, 7) -deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) - - RODATA - TYPE(ctz_table,object) -ctz_table: - .byte MAXSHIFT -forloop(i,1,MASK, -` .byte m4_count_trailing_zeros(i) -') - SIZE(ctz_table,.-ctz_table) - -C Threshold of when to call bmod when U is one limb. Should be about -C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit). -define(`BMOD_THRES_LOG2', 14) - -C INPUT PARAMETERS -define(`up', `%i0') -define(`n', `%i1') -define(`v0', `%i2') - - -ASM_START() - REGISTER(%g2,#scratch) - REGISTER(%g3,#scratch) -PROLOGUE(mpn_gcd_1) - save %sp, -192, %sp - ldx [up+0], %g1 C U low limb - mov -1, %i4 - or v0, %g1, %g2 C x | y - -L(twos): - inc %i4 - andcc %g2, 1, %g0 - bz,a %xcc, L(twos) - srlx %g2, 1, %g2 - -L(divide_strip_y): - andcc v0, 1, %g0 - bz,a %xcc, L(divide_strip_y) - srlx v0, 1, v0 - - cmp n, 1 C if n > 1 we need - bnz %xcc, L(bmod) C to call bmod_1 - nop - -C Both U and V are single limbs, reduce with bmod if u0 >> v0. - srlx %g1, BMOD_THRES_LOG2, %g2 - cmp %g2, v0 - bleu %xcc, L(noreduce) - mov %g1, %o0 - -L(bmod): - mov up, %o0 - mov n, %o1 - mov v0, %o2 - call mpn_modexact_1c_odd - mov 0, %o3 - -L(noreduce): - - LEA64(ctz_table, i5, g4) - - cmp %o0, 0 - bnz %xcc, L(mid) - and %o0, MASK, %g3 C - - return %i7+8 - sllx %o2, %o4, %o0 C CAUTION: v0 alias for o2 - - ALIGN(16) -L(top): movcc %xcc, %l4, v0 C v = min(u,v) - movcc %xcc, %l2, %o0 C u = |v - u] -L(mid): ldub [%i5+%g3], %g5 C - brz,a,pn %g3, L(shift_alot) C - srlx %o0, MAXSHIFT, %o0 - srlx %o0, %g5, %l4 C new u, odd - subcc v0, %l4, %l2 C v - u, set flags for branch and movcc - sub %l4, v0, %o0 C u - v - bnz,pt %xcc, L(top) C - and %l2, MASK, %g3 C extract low MAXSHIFT bits from (v-u) - - return %i7+8 - sllx %o2, %o4, %o0 C CAUTION: v0 alias for o2 - -L(shift_alot): - b L(mid) - and %o0, MASK, %g3 C -EPILOGUE() diff --git a/mpn/x86/k6/gcd_1.asm b/mpn/x86/k6/gcd_1.asm deleted file mode 100644 index a45774d37..000000000 --- a/mpn/x86/k6/gcd_1.asm +++ /dev/null @@ -1,359 +0,0 @@ -dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd. - -dnl Copyright 2000-2002, 2004, 2014 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C K6: 9.5 cycles/bit (approx) 1x1 gcd -C 11.0 cycles/limb Nx1 reduction (modexact_1_odd) - - -C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y); -C -C This code is nothing very special, but offers a speedup over what gcc 2.95 -C can do with mpn/generic/gcd_1.c. -C -C Future: -C -C Using a lookup table to count trailing zeros seems a touch quicker, but -C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used -C it too. - - -dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits -dnl bigger than y, then a division x%y is done to reduce it. -dnl -dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so -dnl there should be an advantage in the divl at about 4 or 5 bits, which is -dnl what's found. - -deflit(DIV_THRESHOLD, 5) - - -defframe(PARAM_LIMB, 12) -defframe(PARAM_SIZE, 8) -defframe(PARAM_SRC, 4) - - TEXT - ALIGN(16) - -PROLOGUE(mpn_gcd_1) -deflit(`FRAME',0) - - ASSERT(ne, `cmpl $0, PARAM_LIMB') - ASSERT(ae, `cmpl $1, PARAM_SIZE') - - - movl PARAM_SRC, %eax - pushl %ebx FRAME_pushl() - - movl PARAM_LIMB, %edx - movl $-1, %ecx - - movl (%eax), %ebx C src low limb - - movl %ebx, %eax C src low limb - orl %edx, %ebx - -L(common_twos): - shrl %ebx - incl %ecx - - jnc L(common_twos) C 1/4 chance on random data - shrl %cl, %edx C y - - cmpl $1, PARAM_SIZE - ja L(size_two_or_more) - - - ASSERT(nz, `orl %eax, %eax') C should have src limb != 0 - - shrl %cl, %eax C x - - - C Swap if necessary to make x>=y. Measures a touch quicker as a - C jump than a branch free calculation. - C - C eax x - C ebx - C ecx common twos - C edx y - - movl %eax, %ebx - cmpl %eax, %edx - - jb L(noswap) - movl %edx, %eax - - movl %ebx, %edx - movl %eax, %ebx -L(noswap): - - - C See if it's worth reducing x with a divl. - C - C eax x - C ebx x - C ecx common twos - C edx y - - shrl $DIV_THRESHOLD, %ebx - - cmpl %ebx, %edx - ja L(nodiv) - - - C Reduce x to x%y. - C - C eax x - C ebx - C ecx common twos - C edx y - - movl %edx, %ebx - xorl %edx, %edx - - divl %ebx - - orl %edx, %edx C y - nop C code alignment - - movl %ebx, %eax C x - jz L(done_shll) -L(nodiv): - - - C eax x - C ebx - C ecx common twos - C edx y - C esi - C edi - C ebp - -L(strip_y): - shrl %edx - jnc L(strip_y) - - leal 1(%edx,%edx), %edx - movl %ecx, %ebx C common twos - - leal 1(%eax), %ecx - jmp L(strip_x_and) - - -C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch -C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is -C only 1/4. -C -C A second computed %cl shift was tried, but that measured a touch slower -C than branching back. -C -C A branch-free abs(x-y) and min(x,y) calculation was tried, but that -C measured about 1 cycle/bit slower. - - C eax x - C ebx common twos - C ecx scratch - C edx y - - ALIGN(4) -L(swap): - addl %eax, %edx C x-y+y = x - negl %eax C -(x-y) = y-x - -L(strip_x): - shrl %eax C odd-odd = even, so always one to strip - ASSERT(nz) - -L(strip_x_leal): - leal 1(%eax), %ecx - -L(strip_x_and): - andl $1, %ecx C (x^1)&1 - - shrl %cl, %eax C shift if x even - - testb $1, %al - jz L(strip_x) - - ASSERT(nz,`testl $1, %eax') C x, y odd - ASSERT(nz,`testl $1, %edx') - - subl %edx, %eax - jb L(swap) - ja L(strip_x) - - - movl %edx, %eax - movl %ebx, %ecx - -L(done_shll): - shll %cl, %eax - popl %ebx - - ret - - -C ----------------------------------------------------------------------------- -C Two or more limbs. -C -C x={src,size} is reduced modulo y using either a plain mod_1 style -C remainder, or a modexact_1 style exact division. - -deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4)) - - ALIGN(8) -L(size_two_or_more): - C eax - C ebx - C ecx common twos - C edx y, without common twos - C esi - C edi - C ebp - -deflit(FRAME_TWO_OR_MORE, FRAME) - - pushl %edi defframe_pushl(SAVE_EDI) - movl PARAM_SRC, %ebx - -L(y_twos): - shrl %edx - jnc L(y_twos) - - movl %ecx, %edi C common twos - movl PARAM_SIZE, %ecx - - pushl %esi defframe_pushl(SAVE_ESI) - leal 1(%edx,%edx), %esi C y (odd) - - movl -4(%ebx,%ecx,4), %eax C src high limb - - cmpl %edx, %eax C carry if high<divisor - - sbbl %edx, %edx C -1 if high<divisor - - addl %edx, %ecx C skip one limb if high<divisor - andl %eax, %edx - - cmpl $MODEXACT_THRESHOLD, %ecx - jae L(modexact) - - -L(divide_top): - C eax scratch (quotient) - C ebx src - C ecx counter, size-1 to 1 - C edx carry (remainder) - C esi divisor (odd) - C edi - C ebp - - movl -4(%ebx,%ecx,4), %eax - divl %esi - loop L(divide_top) - - - movl %edx, %eax C x - movl %esi, %edx C y (odd) - - movl %edi, %ebx C common twos - popl %esi - - popl %edi - leal 1(%eax), %ecx - - orl %eax, %eax - jnz L(strip_x_and) - - - movl %ebx, %ecx - movl %edx, %eax - - shll %cl, %eax - popl %ebx - - ret - - - ALIGN(8) -L(modexact): - C eax - C ebx src ptr - C ecx size or size-1 - C edx - C esi y odd - C edi common twos - C ebp - - movl PARAM_SIZE, %eax - pushl %esi FRAME_pushl() - - pushl %eax FRAME_pushl() - - pushl %ebx FRAME_pushl() - -ifdef(`PIC_WITH_EBX',` - nop C code alignment - call L(movl_eip_ebx) - add $_GLOBAL_OFFSET_TABLE_, %ebx -') - CALL( mpn_modexact_1_odd) - - movl %esi, %edx C y odd - movl SAVE_ESI, %esi - - movl %edi, %ebx C common twos - movl SAVE_EDI, %edi - - addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp - orl %eax, %eax - - leal 1(%eax), %ecx - jnz L(strip_x_and) - - - movl %ebx, %ecx - movl %edx, %eax - - shll %cl, %eax - popl %ebx - - ret - - -ifdef(`PIC_WITH_EBX',` -L(movl_eip_ebx): - movl (%esp), %ebx - ret_internal -') - -EPILOGUE() diff --git a/mpn/x86/k7/gcd_1.asm b/mpn/x86/k7/gcd_1.asm deleted file mode 100644 index 479d2c2b7..000000000 --- a/mpn/x86/k7/gcd_1.asm +++ /dev/null @@ -1,193 +0,0 @@ -dnl x86 mpn_gcd_1 optimised for AMD K7. - -dnl Contributed to the GNU project by by Kevin Ryde. Rehacked by Torbjorn -dnl Granlund. - -dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software -dnl Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/bit (approx) -C AMD K7 5.31 -C AMD K8,K9 5.33 -C AMD K10 5.30 -C AMD bd1 ? -C AMD bobcat 7.02 -C Intel P4-2 10.1 -C Intel P4-3/4 10.0 -C Intel P6/13 5.88 -C Intel core2 6.26 -C Intel NHM 6.83 -C Intel SBR 8.50 -C Intel atom 8.90 -C VIA nano ? -C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1 - -C TODO -C * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny. -C * Stream things better through registers, avoiding some copying. -C * For ELF, avoid putting GOT base in both ebx and esi. Needs special -C LEA/LEAL or else discrete code here. - -C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. - -deflit(MAXSHIFT, 6) -deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) - -DEF_OBJECT(ctz_table,64) - .byte MAXSHIFT -forloop(i,1,MASK, -` .byte m4_count_trailing_zeros(i) -') -END_OBJECT(ctz_table) - -C Threshold of when to call bmod when U is one limb. Should be about -C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit). -define(`DIV_THRES_LOG2', 7) - - -define(`up', `%edi') -define(`n', `%esi') -define(`v0', `%edx') - - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_gcd_1) - push %edi - push %esi - - mov 12(%esp), up - mov 16(%esp), n - mov 20(%esp), v0 - - mov (up), %eax C U low limb - or v0, %eax C x | y - mov $-1, %ecx - -L(twos): - inc %ecx - shr %eax - jnc L(twos) - - shr %cl, v0 - mov %ecx, %eax C common twos - -L(divide_strip_y): - shr v0 - jnc L(divide_strip_y) - adc v0, v0 - - push %eax - push v0 - - cmp $1, n - jnz L(reduce_nby1) - -C Both U and V are single limbs, reduce with div if u0 >> v0. - mov (up), %ecx - mov %ecx, %eax - shr $DIV_THRES_LOG2, %ecx - cmp %ecx, v0 - ja L(reduced) - - mov v0, %esi - xor %edx, %edx - div %esi - mov %edx, %eax - jmp L(reduced) - -L(reduce_nby1): -ifdef(`PIC_WITH_EBX',`dnl - push %ebx - add $-4, %esp - call L(movl_eip_ebx) - add $_GLOBAL_OFFSET_TABLE_, %ebx -') - push v0 C param 3 - push n C param 2 - push up C param 1 - cmp $BMOD_1_TO_MOD_1_THRESHOLD, n - jl L(bmod) - CALL( mpn_mod_1) - jmp L(called) -L(bmod): - CALL( mpn_modexact_1_odd) - -L(called): -ifdef(`PIC_WITH_EBX',`dnl - add $16, %esp C deallocate params - pop %ebx -',` - add $12, %esp C deallocate params -') -L(reduced): - pop %edx - - LEAL( ctz_table, %esi) - test %eax, %eax - mov %eax, %ecx - jnz L(mid) - jmp L(end) - - ALIGN(16) C K8 BC P4 NHM SBR -L(top): cmovc( %ecx, %eax) C if x-y < 0 0 - cmovc( %edi, %edx) C use x,y-x 0 -L(mid): and $MASK, %ecx C 0 - movzbl (%esi,%ecx), %ecx C 1 - jz L(shift_alot) C 1 - shr %cl, %eax C 3 - mov %eax, %edi C 4 - mov %edx, %ecx C 3 - sub %eax, %ecx C 4 - sub %edx, %eax C 4 - jnz L(top) C 5 - -L(end): pop %ecx - mov %edx, %eax - shl %cl, %eax - pop %esi - pop %edi - ret - -L(shift_alot): - shr $MAXSHIFT, %eax - mov %eax, %ecx - jmp L(mid) - -ifdef(`PIC_WITH_EBX',`dnl -L(movl_eip_ebx): - mov (%esp), %ebx - ret -') -EPILOGUE() -ASM_END() diff --git a/mpn/x86/p6/gcd_1.asm b/mpn/x86/p6/gcd_1.asm deleted file mode 100644 index eafbf4a79..000000000 --- a/mpn/x86/p6/gcd_1.asm +++ /dev/null @@ -1,161 +0,0 @@ -dnl x86 mpn_gcd_1 optimised for processors with fast BSF. - -dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked by Torbjorn Granlund. - -dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2015 Free Software -dnl Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/bit (approx) -C AMD K7 7.80 -C AMD K8,K9 7.79 -C AMD K10 4.08 -C AMD bd1 ? -C AMD bobcat 7.82 -C Intel P4-2 14.9 -C Intel P4-3/4 14.0 -C Intel P6/13 5.09 -C Intel core2 4.22 -C Intel NHM 5.00 -C Intel SBR 5.00 -C Intel atom 17.1 -C VIA nano ? -C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1 - -C Threshold of when to call bmod when U is one limb. Should be about -C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit). -define(`BMOD_THRES_LOG2', 6) - - -define(`up', `%edi') -define(`n', `%esi') -define(`v0', `%edx') - - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_gcd_1) - push %edi - push %esi - - mov 12(%esp), up - mov 16(%esp), n - mov 20(%esp), v0 - - mov (up), %eax C U low limb - or v0, %eax - bsf %eax, %eax C min(ctz(u0),ctz(v0)) - - bsf v0, %ecx - shr %cl, v0 - - push %eax C preserve common twos over call - push v0 C preserve v0 argument over call - - cmp $1, n - jnz L(reduce_nby1) - -C Both U and V are single limbs, reduce with bmod if u0 >> v0. - mov (up), %ecx - mov %ecx, %eax - shr $BMOD_THRES_LOG2, %ecx - cmp %ecx, v0 - ja L(reduced) - jmp L(bmod) - -L(reduce_nby1): - cmp $BMOD_1_TO_MOD_1_THRESHOLD, n - jl L(bmod) -ifdef(`PIC_WITH_EBX',`dnl - push %ebx - add $-4, %esp - call L(movl_eip_to_ebx) - add $_GLOBAL_OFFSET_TABLE_, %ebx -') - push v0 C param 3 - push n C param 2 - push up C param 1 - CALL( mpn_mod_1) - jmp L(called) - -L(bmod): -ifdef(`PIC_WITH_EBX',`dnl - push %ebx - add $-4, %esp - call L(movl_eip_to_ebx) - add $_GLOBAL_OFFSET_TABLE_, %ebx -') - push v0 C param 3 - push n C param 2 - push up C param 1 - CALL( mpn_modexact_1_odd) - -L(called): -ifdef(`PIC_WITH_EBX',`dnl - add $16, %esp C deallocate params - pop %ebx -',` - add $12, %esp C deallocate params -') -L(reduced): - pop %edx - - bsf %eax, %ecx -C test %eax, %eax C FIXME: does this lower latency? - jnz L(mid) - jmp L(end) - - ALIGN(16) C K10 BD C2 NHM SBR -L(top): cmovc( %esi, %eax) C if x-y < 0 0,3 0,3 0,6 0,5 0,5 - cmovc( %edi, %edx) C use x,y-x 0,3 0,3 2,8 1,7 1,7 -L(mid): shr %cl, %eax C 1,7 1,6 2,8 2,8 2,8 - mov %edx, %esi C 1 1 4 3 3 - sub %eax, %esi C 2 2 5 4 4 - bsf %esi, %ecx C 3 3 6 5 5 - mov %eax, %edi C 2 2 3 3 4 - sub %edx, %eax C 2 2 4 3 4 - jnz L(top) C - -L(end): pop %ecx - mov %edx, %eax - shl %cl, %eax - - pop %esi - pop %edi - ret - -ifdef(`PIC_WITH_EBX',`dnl -L(movl_eip_to_ebx): - mov (%esp), %ebx - ret -') -EPILOGUE() diff --git a/mpn/x86_64/bd1/gcd_1.asm b/mpn/x86_64/bd1/gcd_1.asm deleted file mode 100644 index 3d8e5c7ab..000000000 --- a/mpn/x86_64/bd1/gcd_1.asm +++ /dev/null @@ -1,37 +0,0 @@ -dnl AMD64 mpn_gcd_1. - -dnl Copyright 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -MULFUNC_PROLOGUE(mpn_gcd_1) -include_mpn(`x86_64/core2/gcd_1.asm') diff --git a/mpn/x86_64/bd2/gcd_1.asm b/mpn/x86_64/bd2/gcd_1.asm deleted file mode 100644 index 42b71a7bc..000000000 --- a/mpn/x86_64/bd2/gcd_1.asm +++ /dev/null @@ -1,164 +0,0 @@ -dnl AMD64 mpn_gcd_1 optimised for AMD BD2-BD4, Zen. - -dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn -dnl Granlund. - -dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software -dnl Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/bit (approx) -C AMD K8,K9 ? -C AMD K10 ? -C AMD bd1 ? -C AMD bd2 ? -C AMD bd3 ? -C AMD bd4 3.65 -C AMD bt1 ? -C AMD bt2 ? -C AMD zn1 3.5 -C AMD zn2 3.8 -C Intel P4 ? -C Intel core2 ? -C Intel NHM ? -C Intel SBR ? -C Intel IBR ? -C Intel HWL ? -C Intel BWL ? -C Intel SKL ? -C Intel atom ? -C Intel SLM ? -C Intel GLM ? -C Intel GLM+ ? -C VIA nano ? -C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 - -C TODO -C * Optimise inner-loop for specific CPUs. -C * Use DIV for 1-by-1 reductions, at least for some CPUs. - -C Threshold of when to call bmod when U is one limb. Should be about -C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit). -define(`BMOD_THRES_LOG2', 6) - -C INPUT PARAMETERS -define(`up', `%rdi') -define(`n', `%rsi') -define(`v0', `%rdx') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -IFDOS(`define(`STACK_ALLOC', 40)') -IFSTD(`define(`STACK_ALLOC', 8)') - -C Undo some configure cleverness. -C The problem is that C only defines the '1c' variant, and that configure -C therefore considers modexact_1c to be the base function. It then adds a -C special fat rule for mpn_modexact_1_odd, messing up things when a cpudep -C gcd_1 exists without a corresponding cpudep mode1o. -ifdef(`WANT_FAT_BINARY', ` - define(`mpn_modexact_1_odd', `MPN_PREFIX`modexact_1_odd_x86_64'')') - - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_gcd_1) - FUNC_ENTRY(3) - mov (up), %rax C U low limb - or v0, %rax C x | y - bsf %rax, %rax C min(ctz(u0),ctz(v0)) - - bsf v0, %rcx - shr R8(%rcx), v0 - - push %rax C preserve common twos over call - - cmp $1, n - jnz L(reduce_nby1) - -C Both U and V are single limbs, reduce with bmod if u0 >> v0. - mov (up), %r8 - mov %r8, %rax - shr $BMOD_THRES_LOG2, %r8 - cmp %r8, v0 - ja L(reduced) - -L(bmod): - push v0 C preserve v0 argument over call - sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment -IFDOS(` mov %rdx, %r8 ') -IFDOS(` mov %rsi, %rdx ') -IFDOS(` mov %rdi, %rcx ') - ASSERT(nz, `test $15, %rsp') - CALL( mpn_modexact_1_odd) - -L(called): - add $STACK_ALLOC, %rsp - pop v0 - -L(reduced): - bsf %rax, %rcx -C test %rax, %rax C FIXME: does this lower latency? - jnz L(mid) - jmp L(end) - -L(reduce_nby1): - cmp $BMOD_1_TO_MOD_1_THRESHOLD, n - jl L(bmod) - - push v0 C preserve v0 argument over call - sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment -IFDOS(` mov %rdx, %r8 ') -IFDOS(` mov %rsi, %rdx ') -IFDOS(` mov %rdi, %rcx ') - ASSERT(nz, `test $15, %rsp') - CALL( mpn_mod_1) - jmp L(called) - - ALIGN(16) C K10 BD1 BD2 ZEN CNR NHM SBR -L(top): cmovc %r10, %rax C if x-y < 0 0,3 0,3 0,3 0,3 0,6 0,5 0,5 - cmovc %r9, v0 C use x,y-x 0,3 0,3 0,3 0,3 2,8 1,7 1,7 -L(mid): shr R8(%rcx), %rax C 1,7 1,6 1,5 1,4 2,8 2,8 2,8 - mov v0, %r10 C 1 1 1 1 4 3 3 - sub %rax, %r10 C 2 2 2 1 5 4 4 - rep;bsf %r10, %rcx C tzcnt! 3 3 3 2 6 5 5 - mov %rax, %r9 C 2 2 2 2 3 3 4 - sub v0, %rax C 2 2 2 2 4 3 4 - jnz L(top) C - -L(end): pop %rcx C common twos - mov v0, %rax - shl R8(%rcx), %rax - FUNC_EXIT() - ret -EPILOGUE() diff --git a/mpn/x86_64/bt2/gcd_1.asm b/mpn/x86_64/bt2/gcd_1.asm deleted file mode 100644 index 133d98363..000000000 --- a/mpn/x86_64/bt2/gcd_1.asm +++ /dev/null @@ -1,37 +0,0 @@ -dnl AMD64 mpn_gcd_1. - -dnl Copyright 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -MULFUNC_PROLOGUE(mpn_gcd_1) -include_mpn(`x86_64/bd2/gcd_1.asm') diff --git a/mpn/x86_64/core2/gcd_1.asm b/mpn/x86_64/core2/gcd_1.asm deleted file mode 100644 index 52425a669..000000000 --- a/mpn/x86_64/core2/gcd_1.asm +++ /dev/null @@ -1,151 +0,0 @@ -dnl AMD64 mpn_gcd_1 optimised for Intel C2, NHM, SBR and AMD K10, BD. - -dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn -dnl Granlund. - -dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software -dnl Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/bit (approx) -C AMD K8,K9 8.50 -C AMD K10 4.30 -C AMD bd1 5.00 -C AMD bobcat 10.0 -C Intel P4 18.6 -C Intel core2 3.83 -C Intel NHM 5.17 -C Intel SBR 4.69 -C Intel atom 17.0 -C VIA nano 5.44 -C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 - -C TODO -C * Optimise inner-loop for specific CPUs. -C * Use DIV for 1-by-1 reductions, at least for some CPUs. - -C Threshold of when to call bmod when U is one limb. Should be about -C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit). -define(`BMOD_THRES_LOG2', 6) - -C INPUT PARAMETERS -define(`up', `%rdi') -define(`n', `%rsi') -define(`v0', `%rdx') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -IFDOS(`define(`STACK_ALLOC', 40)') -IFSTD(`define(`STACK_ALLOC', 8)') - -C Undo some configure cleverness. -C The problem is that C only defines the '1c' variant, and that configure -C therefore considers modexact_1c to be the base function. It then adds a -C special fat rule for mpn_modexact_1_odd, messing up things when a cpudep -C gcd_1 exists without a corresponding cpudep mode1o. -ifdef(`WANT_FAT_BINARY', ` - define(`mpn_modexact_1_odd', `MPN_PREFIX`modexact_1_odd_x86_64'')') - - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_gcd_1) - FUNC_ENTRY(3) - mov (up), %rax C U low limb - or v0, %rax C x | y - bsf %rax, %rax C min(ctz(u0),ctz(v0)) - - bsf v0, %rcx - shr R8(%rcx), v0 - - push %rax C preserve common twos over call - - cmp $1, n - jnz L(reduce_nby1) - -C Both U and V are single limbs, reduce with bmod if u0 >> v0. - mov (up), %r8 - mov %r8, %rax - shr $BMOD_THRES_LOG2, %r8 - cmp %r8, v0 - ja L(reduced) - -L(bmod): - push v0 C preserve v0 argument over call - sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment -IFDOS(` mov %rdx, %r8 ') -IFDOS(` mov %rsi, %rdx ') -IFDOS(` mov %rdi, %rcx ') - ASSERT(nz, `test $15, %rsp') - CALL( mpn_modexact_1_odd) - -L(called): - add $STACK_ALLOC, %rsp - pop v0 - -L(reduced): - bsf %rax, %rcx -C test %rax, %rax C FIXME: does this lower latency? - jnz L(mid) - jmp L(end) - -L(reduce_nby1): - cmp $BMOD_1_TO_MOD_1_THRESHOLD, n - jl L(bmod) - - push v0 C preserve v0 argument over call - sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment -IFDOS(` mov %rdx, %r8 ') -IFDOS(` mov %rsi, %rdx ') -IFDOS(` mov %rdi, %rcx ') - ASSERT(nz, `test $15, %rsp') - CALL( mpn_mod_1) - jmp L(called) - - ALIGN(16) C K10 BD C2 NHM SBR -L(top): cmovc %r10, %rax C if x-y < 0 0,3 0,3 0,6 0,5 0,5 - cmovc %r9, v0 C use x,y-x 0,3 0,3 2,8 1,7 1,7 -L(mid): shr R8(%rcx), %rax C 1,7 1,6 2,8 2,8 2,8 - mov v0, %r10 C 1 1 4 3 3 - sub %rax, %r10 C 2 2 5 4 4 - bsf %r10, %rcx C 3 3 6 5 5 - mov %rax, %r9 C 2 2 3 3 4 - sub v0, %rax C 2 2 4 3 4 - jnz L(top) C - -L(end): pop %rcx C common twos - mov v0, %rax - shl R8(%rcx), %rax - FUNC_EXIT() - ret -EPILOGUE() diff --git a/mpn/x86_64/gcd_1.asm b/mpn/x86_64/gcd_1.asm deleted file mode 100644 index 65eba6960..000000000 --- a/mpn/x86_64/gcd_1.asm +++ /dev/null @@ -1,170 +0,0 @@ -dnl AMD64 mpn_gcd_1 -- mpn by 1 gcd. - -dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn -dnl Granlund. - -dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software -dnl Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - - -C cycles/bit (approx) -C AMD K8,K9 5.21 (4.95) -C AMD K10 5.15 (5.00) -C AMD bd1 5.42 (5.14) -C AMD bobcat 6.71 (6.56) -C Intel P4 13.5 (12.75) -C Intel core2 6.20 (6.16) -C Intel NHM 6.49 (6.25) -C Intel SBR 7.75 (7.57) -C Intel atom 8.77 (8.54) -C VIA nano 6.60 (6.20) -C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 - -C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. - -deflit(MAXSHIFT, 7) -deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) - -DEF_OBJECT(ctz_table,64) - .byte MAXSHIFT -forloop(i,1,MASK, -` .byte m4_count_trailing_zeros(i) -') -END_OBJECT(ctz_table) - -C Threshold of when to call bmod when U is one limb. Should be about -C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit). -define(`BMOD_THRES_LOG2', 8) - -C INPUT PARAMETERS -define(`up', `%rdi') -define(`n', `%rsi') -define(`v0', `%rdx') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -IFDOS(`define(`STACK_ALLOC', 40)') -IFSTD(`define(`STACK_ALLOC', 8)') - -ASM_START() - TEXT - ALIGN(16) -PROLOGUE(mpn_gcd_1) - FUNC_ENTRY(3) - mov (up), %rax C U low limb - mov $-1, R32(%rcx) - or v0, %rax C x | y - -L(twos): - inc R32(%rcx) - shr %rax - jnc L(twos) - - shr R8(%rcx), v0 - push %rcx C common twos - -L(divide_strip_y): - shr v0 - jnc L(divide_strip_y) - adc v0, v0 - - cmp $1, n -ifelse(BMOD_1_TO_MOD_1_THRESHOLD, MP_SIZE_T_MAX,` - jnz L(bmod) -',` - jnz L(reduce_nby1) -') -C Both U and V are single limbs, reduce with bmod if u0 >> v0. - mov (up), %r8 - mov %r8, %rax - shr $BMOD_THRES_LOG2, %r8 - cmp %r8, v0 - ja L(reduced) - -L(bmod): - push v0 C preserve v0 argument over call - sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment -IFDOS(` mov %rdx, %r8 ') -IFDOS(` mov %rsi, %rdx ') -IFDOS(` mov %rdi, %rcx ') - ASSERT(nz, `test $15, %rsp') - CALL( mpn_modexact_1_odd) - -L(called): - add $STACK_ALLOC, %rsp - pop v0 - -L(reduced): - LEA( ctz_table, %rsi) - test %rax, %rax - mov %rax, %rcx - jnz L(mid) - jmp L(end) - -ifelse(BMOD_1_TO_MOD_1_THRESHOLD, `MP_SIZE_T_MAX',,` -L(reduce_nby1): - cmp $BMOD_1_TO_MOD_1_THRESHOLD, n - jl L(bmod) - - push v0 C preserve v0 argument over call - sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment -IFDOS(` mov %rdx, %r8 ') -IFDOS(` mov %rsi, %rdx ') -IFDOS(` mov %rdi, %rcx ') - ASSERT(nz, `test $15, %rsp') - CALL( mpn_mod_1) - jmp L(called) -') - ALIGN(16) C K8 BC P4 NHM SBR -L(top): cmovc %rcx, %rax C if x-y < 0 0 - cmovc %rdi, v0 C use x,y-x 0 -L(mid): and $MASK, R32(%rcx) C 0 - movzbl (%rsi,%rcx), R32(%rcx) C 1 - jz L(shift_alot) C 1 - shr R8(%rcx), %rax C 3 - mov %rax, %rdi C 4 - mov v0, %rcx C 3 - sub %rax, %rcx C 4 - sub v0, %rax C 4 - jnz L(top) C - -L(end): pop %rcx - mov v0, %rax - shl R8(%rcx), %rax - FUNC_EXIT() - ret - -L(shift_alot): - shr $MAXSHIFT, %rax - mov %rax, %rcx - jmp L(mid) -EPILOGUE() diff --git a/mpn/x86_64/k10/gcd_1.asm b/mpn/x86_64/k10/gcd_1.asm deleted file mode 100644 index 3d8e5c7ab..000000000 --- a/mpn/x86_64/k10/gcd_1.asm +++ /dev/null @@ -1,37 +0,0 @@ -dnl AMD64 mpn_gcd_1. - -dnl Copyright 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -MULFUNC_PROLOGUE(mpn_gcd_1) -include_mpn(`x86_64/core2/gcd_1.asm') diff --git a/mpn/x86_64/nano/gcd_1.asm b/mpn/x86_64/nano/gcd_1.asm deleted file mode 100644 index 3d8e5c7ab..000000000 --- a/mpn/x86_64/nano/gcd_1.asm +++ /dev/null @@ -1,37 +0,0 @@ -dnl AMD64 mpn_gcd_1. - -dnl Copyright 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -MULFUNC_PROLOGUE(mpn_gcd_1) -include_mpn(`x86_64/core2/gcd_1.asm') diff --git a/mpn/x86_64/zen/gcd_1.asm b/mpn/x86_64/zen/gcd_1.asm deleted file mode 100644 index 133d98363..000000000 --- a/mpn/x86_64/zen/gcd_1.asm +++ /dev/null @@ -1,37 +0,0 @@ -dnl AMD64 mpn_gcd_1. - -dnl Copyright 2012 Free Software Foundation, Inc. - -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or modify -dnl it under the terms of either: -dnl -dnl * the GNU Lesser General Public License as published by the Free -dnl Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl or -dnl -dnl * the GNU General Public License as published by the Free Software -dnl Foundation; either version 2 of the License, or (at your option) any -dnl later version. -dnl -dnl or both in parallel, as here. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -dnl for more details. -dnl -dnl You should have received copies of the GNU General Public License and the -dnl GNU Lesser General Public License along with the GNU MP Library. If not, -dnl see https://www.gnu.org/licenses/. - -include(`../config.m4') - -ABI_SUPPORT(DOS64) -ABI_SUPPORT(STD64) - -MULFUNC_PROLOGUE(mpn_gcd_1) -include_mpn(`x86_64/bd2/gcd_1.asm') |