summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Torbjorn Granlund <tg@gmplib.org> 2019-11-29 00:53:11 +0100
committer Torbjorn Granlund <tg@gmplib.org> 2019-11-29 00:53:11 +0100
commit 175570b422747d6a7e7ec141e5c04197e0c7d876 (patch)
tree 47c2f0d2c00835224d3d0abfa636e6b8806689e5
parent 3c9499b33207fab15aa61f90c8537f4a1514a416 (diff)
download gmp-175570b422747d6a7e7ec141e5c04197e0c7d876.tar.gz
Remove all gcd_1.asm files.
-rw-r--r-- mpn/alpha/ev67/gcd_1.asm 145
-rw-r--r-- mpn/arm/v5/gcd_1.asm 123
-rw-r--r-- mpn/arm/v6t2/gcd_1.asm 118
-rw-r--r-- mpn/arm64/gcd_1.asm 125
-rw-r--r-- mpn/ia64/gcd_1.asm 238
-rw-r--r-- mpn/powerpc64/mode64/gcd_1.asm 125
-rw-r--r-- mpn/powerpc64/mode64/p7/gcd_1.asm 110
-rw-r--r-- mpn/powerpc64/mode64/p9/gcd_1.asm 101
-rw-r--r-- mpn/sparc64/gcd_1.asm 135
-rw-r--r-- mpn/x86/k6/gcd_1.asm 359
-rw-r--r-- mpn/x86/k7/gcd_1.asm 193
-rw-r--r-- mpn/x86/p6/gcd_1.asm 161
-rw-r--r-- mpn/x86_64/bd1/gcd_1.asm 37
-rw-r--r-- mpn/x86_64/bd2/gcd_1.asm 164
-rw-r--r-- mpn/x86_64/bt2/gcd_1.asm 37
-rw-r--r-- mpn/x86_64/core2/gcd_1.asm 151
-rw-r--r-- mpn/x86_64/gcd_1.asm 170
-rw-r--r-- mpn/x86_64/k10/gcd_1.asm 37
-rw-r--r-- mpn/x86_64/nano/gcd_1.asm 37
-rw-r--r-- mpn/x86_64/zen/gcd_1.asm 37
20 files changed, 0 insertions, 2603 deletions
diff --git a/mpn/alpha/ev67/gcd_1.asm b/mpn/alpha/ev67/gcd_1.asm
deleted file mode 100644
index 55fa7d367..000000000
--- a/mpn/alpha/ev67/gcd_1.asm
+++ /dev/null
@@ -1,145 +0,0 @@
-dnl Alpha ev67 mpn_gcd_1 -- Nx1 greatest common divisor.
-
-dnl Copyright 2003, 2004 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C ev67: 3.4 cycles/bitpair for 1x1 part
-
-
-C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
-C
-C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and
-C strip trailing zeros from abs(x-y) to maintain x and y both odd.
-C
-C The trailing zeros are calculated from just x-y, since in twos-complement
-C there's the same number of trailing zeros on d or -d. This means the cttz
-C runs in parallel with abs(x-y).
-C
-C The loop takes 5 cycles, which at 0.68 iterations per bit (for two N-bit
-C operands) with this algorithm gives the measured 3.4 cycles/bitpair.
-C
-C The slottings shown are for SVR4 style systems, Unicos differs in the
-C initial gp setup and the LEA.
-C
-C Enhancement:
-C
-C On the jsr, !lituse_jsr! (when available) would allow the linker to relax
-C it to a bsr, but probably only in a static binary. Plain "jsr foo" gives
-C the right object code for relaxation, and ought to be available
-C everywhere, but we prefer to schedule the GOT ldq (LEA) back earlier, for
-C the usual case of running in a shared library.
-C
-C bsr could perhaps be used explicitly anyway. We should be able to assume
-C modexact is in the same module as us (ie. shared library or mainline).
-C Would there be any worries about the size of the displacement? Could
-C always put modexact and gcd_1 in the same .o to be certain.
-
-ASM_START()
-PROLOGUE(mpn_gcd_1, gp)
-
- C r16 xp
- C r17 size
- C r18 y
-
- C ldah C l
- C lda C u
-
- ldq r0, 0(r16) C L x = xp[0]
- lda r30, -32(r30) C u alloc stack
-
- LEA( r27, mpn_modexact_1c_odd) C L modexact addr, ldq (gp)
- stq r10, 16(r30) C L save r10
- cttz r18, r10 C U0 y twos
- cmpeq r17, 1, r5 C u test size==1
-
- stq r9, 8(r30) C L save r9
- clr r19 C u zero c for modexact
- unop
- unop
-
- cttz r0, r6 C U0 x twos
- stq r26, 0(r30) C L save ra
-
- srl r18, r10, r18 C U y odd
-
- mov r18, r9 C l hold y across call
-
- cmpult r6, r10, r2 C u test x_twos < y_twos
-
- cmovne r2, r6, r10 C l common_twos = min(x_twos,y_twos)
- bne r5, L(one) C U no modexact if size==1
- jsr r26, (r27), mpn_modexact_1c_odd C L0
-
- LDGP( r29, 0(r26)) C u,l ldah,lda
- cttz r0, r6 C U0 new x twos
- ldq r26, 0(r30) C L restore ra
-
-L(one):
- mov r9, r1 C u y
- ldq r9, 8(r30) C L restore r9
- mov r10, r2 C u common twos
- ldq r10, 16(r30) C L restore r10
-
- lda r30, 32(r30) C l free stack
- beq r0, L(done) C U return y if x%y==0
-
- srl r0, r6, r0 C U x odd
- unop
-
- ALIGN(16)
-L(top):
- C r0 x
- C r1 y
- C r2 common twos, for use at end
-
- subq r0, r1, r7 C l0 d = x - y
- cmpult r0, r1, r16 C u0 test x >= y
-
- subq r1, r0, r4 C l0 new_x = y - x
- cttz r7, r8 C U0 d twos
-
- cmoveq r16, r7, r4 C l0 new_x = d if x>=y
- cmovne r16, r0, r1 C u0 y = x if x<y
- unop C l \ force cmoveq into l0
- unop C u /
-
- C C cmoveq2 L0, cmovne2 U0
-
- srl r4, r8, r0 C U0 x = new_x >> twos
- bne r7, L(top) C U1 stop when d==0
-
-
-L(done):
- sll r1, r2, r0 C U0 return y << common_twos
- ret r31, (r26), 1 C L0
-
-EPILOGUE()
-ASM_END()
diff --git a/mpn/arm/v5/gcd_1.asm b/mpn/arm/v5/gcd_1.asm
deleted file mode 100644
index 7897544b9..000000000
--- a/mpn/arm/v5/gcd_1.asm
+++ /dev/null
@@ -1,123 +0,0 @@
-dnl ARM v5 mpn_gcd_1.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjörn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/bit (approx)
-C StrongARM -
-C XScale ?
-C Cortex-A5 6.45
-C Cortex-A7 6.41
-C Cortex-A8 5.0
-C Cortex-A9 5.9
-C Cortex-A15 4.40
-C Cortex-A17 5.68
-C Cortex-A53 4.37
-C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
-
-C TODO
-C * Optimise inner-loop better.
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-C INPUT PARAMETERS
-define(`up', `r0')
-define(`n', `r1')
-define(`v0', `r2')
-
-ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
- `define(`BMOD_1_TO_MOD_1_THRESHOLD',0xffffffff)')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- push {r4, r7, lr}
- ldr r3, [up] C U low limb
-
- orr r3, r3, v0
- rsb r4, r3, #0
- and r4, r4, r3
- clz r4, r4 C min(ctz(u0),ctz(v0))
- rsb r4, r4, #31
-
- rsb r12, v0, #0
- and r12, r12, v0
- clz r12, r12
- rsb r12, r12, #31
- mov v0, v0, lsr r12
-
- mov r7, v0
-
- cmp n, #1
- bne L(nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 is much larger than v0.
- ldr r3, [up]
- cmp v0, r3, lsr #BMOD_THRES_LOG2
- bhi L(red1)
-
-L(bmod):mov r3, #0 C carry argument
- bl mpn_modexact_1c_odd
- b L(red0)
-
-L(nby1):cmp n, #BMOD_1_TO_MOD_1_THRESHOLD
- blo L(bmod)
-
- bl mpn_mod_1
-
-L(red0):mov r3, r0
-L(red1):rsbs r12, r3, #0
- and r12, r12, r3
- clz r12, r12
- rsb r12, r12, #31
- bne L(mid)
- b L(end)
-
- ALIGN(8)
-L(top): rsb r12, r12, #31
- movcc r3, r1 C if x-y < 0
- movcc r7, r0 C use x,y-x
-L(mid): mov r3, r3, lsr r12 C
- mov r0, r3 C
- sub r1, r7, r3 C
- rsbs r3, r7, r3 C
- and r12, r1, r3 C
- clz r12, r12 C
- bne L(top) C
-
-L(end): mov r0, r7, lsl r4
- pop {r4, r7, pc}
-EPILOGUE()
diff --git a/mpn/arm/v6t2/gcd_1.asm b/mpn/arm/v6t2/gcd_1.asm
deleted file mode 100644
index 120a955f3..000000000
--- a/mpn/arm/v6t2/gcd_1.asm
+++ /dev/null
@@ -1,118 +0,0 @@
-dnl ARM v6t2 mpn_gcd_1.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjörn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/bit (approx)
-C StrongARM -
-C XScale -
-C Cortex-A5 5.75
-C Cortex-A7 6.38
-C Cortex-A8 5.0
-C Cortex-A9 5.3
-C Cortex-A15 2.92
-C Cortex-A17 5.63
-C Cortex-A53 4.25
-C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
-
-C TODO
-C * Optimise inner-loop better.
-C * Push saving/restoring of callee-saved regs into call code
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 7)
-
-C INPUT PARAMETERS
-define(`up', `r0')
-define(`n', `r1')
-define(`v0', `r2')
-
-ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
- `define(`BMOD_1_TO_MOD_1_THRESHOLD',0xffffffff)')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- push {r4, r7, lr}
- ldr r3, [up] C U low limb
-
- orr r3, r3, v0
- rbit r4, r3
- clz r4, r4 C min(ctz(u0),ctz(v0))
-
- rbit r12, v0
- clz r12, r12
- mov v0, v0, lsr r12
-
- mov r7, v0
-
- cmp n, #1
- bne L(nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 is much larger than v0.
- ldr r3, [up]
- cmp v0, r3, lsr #BMOD_THRES_LOG2
- bhi L(red1)
-
-L(bmod):mov r3, #0 C carry argument
- bl mpn_modexact_1c_odd
- b L(red0)
-
-L(nby1):cmp n, #BMOD_1_TO_MOD_1_THRESHOLD
- blo L(bmod)
-
- bl mpn_mod_1
-
-L(red0):mov r3, r0
-L(red1):cmp r3, #0
- rbit r12, r3
- clz r12, r12
- bne L(mid)
- b L(end)
-
- ALIGN(8)
-L(top): movcs r3, r1 C if x-y < 0
- movcs r7, r0 C use x,y-x
-L(mid): mov r3, r3, lsr r12 C
- mov r0, r3 C
- subs r1, r7, r3 C
- rsb r3, r7, r3 C
- rbit r12, r1
- clz r12, r12 C
- bne L(top) C
-
-L(end): mov r0, r7, lsl r4
- pop {r4, r7, pc}
-EPILOGUE()
diff --git a/mpn/arm64/gcd_1.asm b/mpn/arm64/gcd_1.asm
deleted file mode 100644
index 55513b31d..000000000
--- a/mpn/arm64/gcd_1.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-dnl ARM v8a mpn_gcd_1.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-changecom(blah)
-
-C cycles/bit (approx)
-C Cortex-A53 ?
-C Cortex-A57 ?
-
-C TODO
-C * Optimise inner-loop better.
-C * Push saving/restoring of callee-saved regs into call code
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 7)
-
-C INPUT PARAMETERS
-define(`up', `x0')
-define(`n', `x1')
-define(`v0', `x2')
-
-ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
- `define(`BMOD_1_TO_MOD_1_THRESHOLD',30)')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- stp x29, x30, [sp,#-32]!
- ldr x3, [up] C U low limb
- stp x19, x20, [sp,#16]
-
- orr x3, x3, v0
- rbit x4, x3
- clz x20, x4 C min(ctz(u0),ctz(v0))
-
- rbit x12, v0
- clz x12, x12
- lsr v0, v0, x12
-
- mov x19, v0
-
- cmp n, #1
- b.ne L(nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 is much larger than v0.
- ldr x3, [up]
- cmp v0, x3, lsr #BMOD_THRES_LOG2
- b.hi L(red1)
-
-L(bmod):mov x3, #0 C carry argument
- bl mpn_modexact_1c_odd
- b L(red0)
-
-L(nby1):cmp n, #BMOD_1_TO_MOD_1_THRESHOLD
- b.lo L(bmod)
-
- bl mpn_mod_1
-
-L(red0):mov x3, x0
-L(red1):cmp x3, #0
- rbit x12, x3
- clz x12, x12
- b.ne L(mid)
- b L(end)
-
- ALIGN(8)
-L(top):
-ifelse(1,1,`
-C This shorter variant makes full use of armv8 insns
- csneg x3, x1, x1, cs C if x-y < 0
- csel x19, x4, x19, cs C use x,y-x
-L(mid): lsr x4, x3, x12 C
- subs x1, x19, x4 C
-',`
-C This variant is akin to the 32-bit v6t2 code
- csel x3, x1, x3, cs C if x-y < 0
- csel x19, x0, x19, cs C use x,y-x
-L(mid): lsr x3, x3, x12 C
- mov x0, x3 C
- subs x1, x19, x3 C
- sub x3, x3, x19 C
-')
- rbit x12, x1
- clz x12, x12 C
- b.ne L(top) C
-
-L(end): lsl x0, x19, x20
- ldp x19, x20, [sp,#16]
- ldp x29, x30, [sp],#32
- ret
-EPILOGUE()
diff --git a/mpn/ia64/gcd_1.asm b/mpn/ia64/gcd_1.asm
deleted file mode 100644
index 3afabd706..000000000
--- a/mpn/ia64/gcd_1.asm
+++ /dev/null
@@ -1,238 +0,0 @@
-dnl Itanium-2 mpn_gcd_1 -- mpn by 1 gcd.
-
-dnl Contributed to the GNU project by Kevin Ryde, innerloop by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2002-2005, 2012, 2013, 2015 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/bitpair (1x1 gcd)
-C Itanium: ?
-C Itanium 2: 5.1
-
-
-C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
-C
-C The entry sequence is designed to expect xsize>1 and hence a modexact
-C call. This ought to be more common than a 1x1 operation. Our critical
-C path is thus stripping factors of 2 from y, calling modexact, then
-C stripping factors of 2 from the x remainder returned.
-C
-C The common factors of 2 between x and y must be determined using the
-C original x, not the remainder from the modexact. This is done with
-C x_orig which is xp[0]. There's plenty of time to do this while the rest
-C of the modexact etc is happening.
-C
-C It's possible xp[0] is zero. In this case the trailing zeros calculation
-C popc((x-1)&~x) gives 64 (the mask is all ones), and that's clearly no less
-C than what y will have, making min(x_twos,y_twos) == y_twos.
-C
-C The main loop consists of transforming x,y to abs(x-y),min(x,y), and then
-C stripping factors of 2 from abs(x-y). Those factors of two are
-C determined from just y-x, without the abs(), since there's the same
-C number of trailing zeros on n or -n in twos complement. That makes the
-C dependent chain 8 cycles deep.
-C
-C The selection of x-y versus y-x for abs(x-y), and the selection of the
-C minimum of x and y, is done in parallel with the critical path.
-C
-C The algorithm takes about 0.68 iterations per bit (two N bit operands) on
-C average, hence the final 5.8 cycles/bitpair.
-C
-C Not done:
-C
-C An alternate algorithm which didn't strip all twos, but instead applied
-C tbit and predicated extr on x, and then y, was attempted. The loop was 6
-C cycles, but the algorithm is an average 1.25 iterations per bitpair for a
-C total 7.25 c/bp, which is slower than the current approach.
-C
-C Alternatives:
-C
-C Perhaps we could do something tricky by extracting a few high bits and a
-C few low bits from the operands, and looking up a table which would give a
-C set of predicates to control some shifts or subtracts or whatever. That
-C could knock off multiple bits per iteration.
-C
-C The right shifts are a bit of a bottleneck (shr at 2 or 3 cycles, or extr
-C only going down I0), perhaps it'd be possible to shift left instead,
-C using add. That would mean keeping track of the lowest not-yet-zeroed
-C bit, using some sort of mask.
-C
-C TODO:
-C * Once mod_1_N exists in assembly for Itanium, add conditional calls.
-C * Call bmod_1 even for n=1 when up[0] >> v0 (like other gcd_1 impls).
-C * Probably avoid popcnt also outside of loop, instead use ctz_table.
-
-ASM_START()
- .explicit C explicit mode: stops (;;) are hand-placed and the assembler checks dependencies
-
-C HP's assembler requires these declarations for importing mpn_modexact_1c_odd
- .global mpn_modexact_1c_odd
- .type mpn_modexact_1c_odd,@function
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-
-deflit(MAXSHIFT, 7)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
-
-C .section ".rodata"
- .rodata
- ALIGN(m4_lshift(1,MAXSHIFT)) C align table to allow using dep
-ctz_table:
- data1 MAXSHIFT
-forloop(i,1,MASK,
-` data1 m4_count_trailing_zeros(i)
-')
-
-PROLOGUE(mpn_gcd_1)
-
- C r32 xp
- C r33 xsize
- C r34 y
-
-define(x, r8)
-define(xp_orig, r32)
-define(xsize, r33)
-define(y, r34) define(inputs, 3)
-define(save_rp, r35)
-define(save_pfs, r36)
-define(x_orig, r37)
-define(x_orig_one, r38)
-define(y_twos, r39) define(locals, 5)
-define(out_xp, r40)
-define(out_xsize, r41)
-define(out_divisor, r42)
-define(out_carry, r43) define(outputs, 4)
-
- .prologue
- {.mmi;
-ifdef(`HAVE_ABI_32',
-` addp4 r9 = 0, xp_orig define(xp,r9)', C M0
-` define(xp,xp_orig)')
- .save ar.pfs, save_pfs
- alloc save_pfs = ar.pfs, inputs, locals, outputs, 0 C M2
- .save rp, save_rp
- mov save_rp = b0 C I0
-}{.mbb; .body
- add r10 = -1, y C M3 y-1
- nop.b 0 C B0
- nop.b 0 C B1
- ;;
-
-}{.mmi; ld8 x = [xp] C M0 x = xp[0] if no modexact
- ld8 x_orig = [xp] C M1 orig x for common twos
- cmp.ne p6,p0 = 1, xsize C I0
-}{.mmi; andcm y_twos = r10, y C M2 (y-1)&~y
- mov out_xp = xp_orig C M3
- mov out_xsize = xsize C I1
- ;;
-}{.mmi; mov out_carry = 0 C M0
- nop.m 0 C M1
- popcnt y_twos = y_twos C I0 y twos
- ;;
-}{.mmi; add x_orig_one = -1, x_orig C M0 orig x-1
- nop.m 0 C M1
- shr.u out_divisor = y, y_twos C I0 y without twos
-}{.mib; nop.m 0 C M2
- shr.u y = y, y_twos C I1 y without twos
- (p6) br.call.sptk.many b0 = mpn_modexact_1c_odd C if xsize>1
- ;;
-}
- C modexact can leave x==0
- {.mmi; cmp.eq p6,p0 = 0, x C M0 if {xp,xsize} % y == 0
- andcm x_orig = x_orig_one, x_orig C M1 orig (x-1)&~x
- add r9 = -1, x C I0 x-1
- ;;
-}{.mmi; andcm r9 = r9, x C M0 (x-1)&~x
- nop.m 0 C M1
- mov b0 = save_rp C I0
- ;;
-}{.mii; nop.m 0 C M0
- popcnt x_orig = x_orig C I0 orig x twos
- popcnt r9 = r9 C I0 x twos
- ;;
-}{.mmi; cmp.lt p7,p0 = x_orig, y_twos C M0 orig x_twos < y_twos
- addl r22 = @ltoff(ctz_table), r1
- shr.u x = x, r9 C I0 x odd
- ;;
-}{.mib;
- (p7) mov y_twos = x_orig C M0 common twos
- add r10 = -1, y C I0 y-1
- (p6) br.dpnt.few L(done_y) C B0 x%y==0 then result y
- ;;
-}
- mov r25 = m4_lshift(MASK, MAXSHIFT)
- ld8 r22 = [r22]
- br L(ent)
- ;;
-
- ALIGN(32)
-L(top):
- .pred.rel "mutex", p6,p7
- {.mmi; (p7) mov y = x
- (p6) sub x = x, y
- dep r21 = r19, r22, 0, MAXSHIFT C concat(table,lowbits)
-}{.mmi; and r20 = MASK, r19
- (p7) mov x = r19
- nop 0
- ;;
-}
-L(mid):
-{.mmb; ld1 r16 = [r21]
- cmp.eq p10,p0 = 0, r20
- (p10) br.spnt.few.clr L(shift_alot)
- ;;
-}{.mmi; nop 0
- nop 0
- shr.u x = x, r16
- ;;
-}
-L(ent):
- {.mmi; sub r19 = y, x
- cmp.gtu p6,p7 = x, y
- cmp.ne p8,p0 = x, y
-}{.mmb; nop 0
- nop 0
- (p8) br.sptk.few.clr L(top)
-}
-
-L(done_y): C result is y
- mov ar.pfs = save_pfs C I0
- shl r8 = y, y_twos C I common factors of 2
- br.ret.sptk.many b0
-
-L(shift_alot):
- and r20 = x, r25
- shr.u x = x, MAXSHIFT
- ;;
- dep r21 = x, r22, 0, MAXSHIFT
- br L(mid)
-EPILOGUE()
diff --git a/mpn/powerpc64/mode64/gcd_1.asm b/mpn/powerpc64/mode64/gcd_1.asm
deleted file mode 100644
index a20734416..000000000
--- a/mpn/powerpc64/mode64/gcd_1.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-dnl PowerPC-64 mpn_gcd_1.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/bit (approx)
-C POWER3/PPC630 ?
-C POWER4/PPC970 8.5
-C POWER5 ?
-C POWER6 10.1
-C POWER7 9.4
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C INPUT PARAMETERS
-define(`up', `r3')
-define(`n', `r4')
-define(`v0', `r5')
-
-ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
- `define(`BMOD_1_TO_MOD_1_THRESHOLD',30)')
-
-EXTERN_FUNC(mpn_mod_1)
-EXTERN_FUNC(mpn_modexact_1c_odd)
-
-ASM_START()
-PROLOGUE(mpn_gcd_1,toc)
- mflr r0
- std r30, -16(r1)
- std r31, -8(r1)
- std r0, 16(r1)
- stdu r1, -128(r1)
-
- ld r7, 0(up) C U low limb
- or r0, r5, r7 C x | y
-
- neg r6, r0
- and r6, r6, r0
- cntlzd r31, r6 C common twos
- subfic r31, r31, 63
-
- neg r6, r5
- and r6, r6, r5
- cntlzd r8, r6
- subfic r8, r8, 63
- srd r5, r5, r8
- mr r30, r5 C v0 saved
-
- cmpdi r4, BMOD_1_TO_MOD_1_THRESHOLD
- blt L(bmod)
- CALL( mpn_mod_1)
- b L(reduced)
-L(bmod):
- li r6, 0
- CALL( mpn_modexact_1c_odd)
-L(reduced):
-
-define(`mask', `r0')dnl
-define(`a1', `r4')dnl
-define(`a2', `r5')dnl
-define(`d1', `r6')dnl
-define(`d2', `r7')dnl
-define(`cnt', `r9')dnl
-
- neg. r6, r3
- and r6, r6, r3
- cntlzd cnt, r6
- subfic cnt, cnt, 63
- li r12, 63
- bne L(mid)
- b L(end)
-
- ALIGN(16)
-L(top):
- and a1, r10, mask C d - a
- andc a2, r11, mask C a - d
- and d1, r3, mask C a
- andc d2, r30, mask C d
- or r3, a1, a2 C new a
- subf cnt, cnt, r12
- or r30, d1, d2 C new d
-L(mid): srd r3, r3, cnt
- sub. r10, r30, r3 C r10 = d - a
- subc r11, r3, r30 C r11 = a - d
- neg r8, r10
- and r8, r8, r10
- subfe mask, mask, mask
- cntlzd cnt, r8
- bne L(top)
-
-L(end): sld r3, r30, r31
-
- addi r1, r1, 128
- ld r0, 16(r1)
- ld r30, -16(r1)
- ld r31, -8(r1)
- mtlr r0
- blr
-EPILOGUE()
diff --git a/mpn/powerpc64/mode64/p7/gcd_1.asm b/mpn/powerpc64/mode64/p7/gcd_1.asm
deleted file mode 100644
index 47cb40bdc..000000000
--- a/mpn/powerpc64/mode64/p7/gcd_1.asm
+++ /dev/null
@@ -1,110 +0,0 @@
-dnl PowerPC-64 mpn_gcd_1.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/bit (approx)
-C POWER3/PPC630 -
-C POWER4/PPC970 -
-C POWER5 -
-C POWER6 -
-C POWER7 7.6
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C INPUT PARAMETERS
-define(`up', `r3')
-define(`n', `r4')
-define(`v0', `r5')
-
-EXTERN_FUNC(mpn_mod_1)
-EXTERN_FUNC(mpn_modexact_1c_odd)
-
-ASM_START()
-PROLOGUE(mpn_gcd_1,toc)
- mflr r0
- std r30, -16(r1)
- std r31, -8(r1)
- std r0, 16(r1)
- stdu r1, -128(r1)
-
- ld r7, 0(up) C U low limb
- or r0, r5, r7 C x | y
-
- neg r6, r0
- and r6, r6, r0
- cntlzd r31, r6 C common twos
- subfic r31, r31, 63
-
- neg r6, r5
- and r6, r6, r5
- cntlzd r8, r6
- subfic r8, r8, 63
- srd r5, r5, r8
- mr r30, r5 C v0 saved
-
- cmpdi r4, BMOD_1_TO_MOD_1_THRESHOLD
- blt L(bmod)
- CALL( mpn_mod_1)
- b L(reduced)
-L(bmod):
- li r6, 0
- CALL( mpn_modexact_1c_odd)
-L(reduced):
-
-define(`cnt', `r9')dnl
-
- neg. r6, r3
- and r6, r6, r3
- cntlzd cnt, r6
- li r12, 63
- bne L(mid)
- b L(end)
-
- ALIGN(16)
-L(top): isel r30, r3, r30, 29 C y = min(x,y)
- isel r3, r10, r11, 29 C x = |y - x|
-L(mid): subf cnt, cnt, r12 C cnt = 63-cnt
- srd r3, r3, cnt
- subf r10, r3, r30 C r10 = y - x
- subf r11, r30, r3 C r11 = x - y
- cmpld cr7, r30, r3
- and r8, r11, r10 C isolate lsb
- cntlzd cnt, r8
- bne cr7, L(top)
-
-L(end): sld r3, r30, r31
-
- addi r1, r1, 128
- ld r0, 16(r1)
- ld r30, -16(r1)
- ld r31, -8(r1)
- mtlr r0
- blr
-EPILOGUE()
diff --git a/mpn/powerpc64/mode64/p9/gcd_1.asm b/mpn/powerpc64/mode64/p9/gcd_1.asm
deleted file mode 100644
index 286e48dc5..000000000
--- a/mpn/powerpc64/mode64/p9/gcd_1.asm
+++ /dev/null
@@ -1,101 +0,0 @@
-dnl PowerPC-64 mpn_gcd_1.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation,
-dnl Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/bit (approx)
-C POWER3/PPC630 -
-C POWER4/PPC970 -
-C POWER5 -
-C POWER6 -
-C POWER7 -
-C POWER8 -
-C POWER9 5.75
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C INPUT PARAMETERS
-define(`up', `r3')
-define(`n', `r4')
-define(`v0', `r5')
-
-EXTERN_FUNC(mpn_mod_1)
-EXTERN_FUNC(mpn_modexact_1c_odd)
-
-ASM_START()
-PROLOGUE(mpn_gcd_1,toc)
- mflr r0
- std r30, -16(r1)
- std r31, -8(r1)
- std r0, 16(r1)
- stdu r1, -128(r1)
-
- ld r7, 0(up) C U low limb
- or r0, r5, r7 C x | y
- cnttzd r31, r0 C common twos
- cnttzd r8, r5
- srd r5, r5, r8
- mr r30, r5 C v0 saved
-
- cmpdi r4, BMOD_1_TO_MOD_1_THRESHOLD
- blt L(bmod)
- CALL( mpn_mod_1)
- b L(reduced)
-L(bmod):
- li r6, 0
- CALL( mpn_modexact_1c_odd)
-L(reduced):
-
-define(`cnt', `r9')dnl
-
- cmpdi r3, 0
- cnttzd cnt, r3
- bne L(mid)
- b L(end)
-
- ALIGN(16)
-L(top): isel r30, r3, r30, 29 C y = min(x,y)
- isel r3, r10, r11, 29 C x = |y - x|
-L(mid): srd r3, r3, cnt
- subf r10, r3, r30 C r10 = y - x
- subf r11, r30, r3 C r11 = x - y
- cmpld cr7, r30, r3
- cnttzd cnt, r10
- bne cr7, L(top)
-
-L(end): sld r3, r30, r31
-
- addi r1, r1, 128
- ld r0, 16(r1)
- ld r30, -16(r1)
- ld r31, -8(r1)
- mtlr r0
- blr
-EPILOGUE()
diff --git a/mpn/sparc64/gcd_1.asm b/mpn/sparc64/gcd_1.asm
deleted file mode 100644
index e4d8de6a2..000000000
--- a/mpn/sparc64/gcd_1.asm
+++ /dev/null
@@ -1,135 +0,0 @@
-dnl SPARC64 mpn_gcd_1.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for SPARC by Torbjörn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/bit (approx)
-C UltraSPARC 1&2: 5.1
-C UltraSPARC 3: 5.0
-C UltraSPARC T1: 11.4
-C UltraSPARC T3: 10
-C UltraSPARC T4: 6
-C Numbers measured with: speed -CD -s32-64 -t32 mpn_gcd_1
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-
-deflit(MAXSHIFT, 7)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
-
- RODATA
- TYPE(ctz_table,object)
-ctz_table:
- .byte MAXSHIFT
-forloop(i,1,MASK,
-` .byte m4_count_trailing_zeros(i)
-')
- SIZE(ctz_table,.-ctz_table)
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 14)
-
-C INPUT PARAMETERS
-define(`up', `%i0')
-define(`n', `%i1')
-define(`v0', `%i2')
-
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_gcd_1)
- save %sp, -192, %sp
- ldx [up+0], %g1 C U low limb
- mov -1, %i4
- or v0, %g1, %g2 C x | y
-
-L(twos):
- inc %i4
- andcc %g2, 1, %g0
- bz,a %xcc, L(twos)
- srlx %g2, 1, %g2
-
-L(divide_strip_y):
- andcc v0, 1, %g0
- bz,a %xcc, L(divide_strip_y)
- srlx v0, 1, v0
-
- cmp n, 1 C if n > 1 we need
- bnz %xcc, L(bmod) C to call bmod_1
- nop
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- srlx %g1, BMOD_THRES_LOG2, %g2
- cmp %g2, v0
- bleu %xcc, L(noreduce)
- mov %g1, %o0
-
-L(bmod):
- mov up, %o0
- mov n, %o1
- mov v0, %o2
- call mpn_modexact_1c_odd
- mov 0, %o3
-
-L(noreduce):
-
- LEA64(ctz_table, i5, g4)
-
- cmp %o0, 0
- bnz %xcc, L(mid)
- and %o0, MASK, %g3 C
-
- return %i7+8
- sllx %o2, %o4, %o0 C CAUTION: v0 alias for o2
-
- ALIGN(16)
-L(top): movcc %xcc, %l4, v0 C v = min(u,v)
- movcc %xcc, %l2, %o0 C u = |v - u|
-L(mid): ldub [%i5+%g3], %g5 C
- brz,a,pn %g3, L(shift_alot) C
- srlx %o0, MAXSHIFT, %o0
- srlx %o0, %g5, %l4 C new u, odd
- subcc v0, %l4, %l2 C v - u, set flags for branch and movcc
- sub %l4, v0, %o0 C u - v
- bnz,pt %xcc, L(top) C
- and %l2, MASK, %g3 C extract low MAXSHIFT bits from (v-u)
-
- return %i7+8
- sllx %o2, %o4, %o0 C CAUTION: v0 alias for o2
-
-L(shift_alot):
- b L(mid)
- and %o0, MASK, %g3 C
-EPILOGUE()
diff --git a/mpn/x86/k6/gcd_1.asm b/mpn/x86/k6/gcd_1.asm
deleted file mode 100644
index a45774d37..000000000
--- a/mpn/x86/k6/gcd_1.asm
+++ /dev/null
@@ -1,359 +0,0 @@
-dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd.
-
-dnl Copyright 2000-2002, 2004, 2014 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C K6: 9.5 cycles/bit (approx) 1x1 gcd
-C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
-
-
-C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
-C
-C This code is nothing very special, but offers a speedup over what gcc 2.95
-C can do with mpn/generic/gcd_1.c.
-C
-C Future:
-C
-C Using a lookup table to count trailing zeros seems a touch quicker, but
-C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used
-C it too.
-
-
-dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
-dnl bigger than y, then a division x%y is done to reduce it.
-dnl
-dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
-dnl there should be an advantage in the divl at about 4 or 5 bits, which is
-dnl what's found.
-
-deflit(DIV_THRESHOLD, 5)
-
-
-defframe(PARAM_LIMB, 12)
-defframe(PARAM_SIZE, 8)
-defframe(PARAM_SRC, 4)
-
- TEXT
- ALIGN(16)
-
-PROLOGUE(mpn_gcd_1)
-deflit(`FRAME',0)
-
- ASSERT(ne, `cmpl $0, PARAM_LIMB')
- ASSERT(ae, `cmpl $1, PARAM_SIZE')
-
-
- movl PARAM_SRC, %eax
- pushl %ebx FRAME_pushl()
-
- movl PARAM_LIMB, %edx
- movl $-1, %ecx
-
- movl (%eax), %ebx C src low limb
-
- movl %ebx, %eax C src low limb
- orl %edx, %ebx
-
-L(common_twos):
- shrl %ebx
- incl %ecx
-
- jnc L(common_twos) C 1/4 chance on random data
- shrl %cl, %edx C y
-
- cmpl $1, PARAM_SIZE
- ja L(size_two_or_more)
-
-
- ASSERT(nz, `orl %eax, %eax') C should have src limb != 0
-
- shrl %cl, %eax C x
-
-
- C Swap if necessary to make x>=y. Measures a touch quicker as a
- C jump than a branch free calculation.
- C
- C eax x
- C ebx
- C ecx common twos
- C edx y
-
- movl %eax, %ebx
- cmpl %eax, %edx
-
- jb L(noswap)
- movl %edx, %eax
-
- movl %ebx, %edx
- movl %eax, %ebx
-L(noswap):
-
-
- C See if it's worth reducing x with a divl.
- C
- C eax x
- C ebx x
- C ecx common twos
- C edx y
-
- shrl $DIV_THRESHOLD, %ebx
-
- cmpl %ebx, %edx
- ja L(nodiv)
-
-
- C Reduce x to x%y.
- C
- C eax x
- C ebx
- C ecx common twos
- C edx y
-
- movl %edx, %ebx
- xorl %edx, %edx
-
- divl %ebx
-
- orl %edx, %edx C y
- nop C code alignment
-
- movl %ebx, %eax C x
- jz L(done_shll)
-L(nodiv):
-
-
- C eax x
- C ebx
- C ecx common twos
- C edx y
- C esi
- C edi
- C ebp
-
-L(strip_y):
- shrl %edx
- jnc L(strip_y)
-
- leal 1(%edx,%edx), %edx
- movl %ecx, %ebx C common twos
-
- leal 1(%eax), %ecx
- jmp L(strip_x_and)
-
-
-C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
-C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is
-C only 1/4.
-C
-C A second computed %cl shift was tried, but that measured a touch slower
-C than branching back.
-C
-C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
-C measured about 1 cycle/bit slower.
-
- C eax x
- C ebx common twos
- C ecx scratch
- C edx y
-
- ALIGN(4)
-L(swap):
- addl %eax, %edx C x-y+y = x
- negl %eax C -(x-y) = y-x
-
-L(strip_x):
- shrl %eax C odd-odd = even, so always one to strip
- ASSERT(nz)
-
-L(strip_x_leal):
- leal 1(%eax), %ecx
-
-L(strip_x_and):
- andl $1, %ecx C (x^1)&1
-
- shrl %cl, %eax C shift if x even
-
- testb $1, %al
- jz L(strip_x)
-
- ASSERT(nz,`testl $1, %eax') C x, y odd
- ASSERT(nz,`testl $1, %edx')
-
- subl %edx, %eax
- jb L(swap)
- ja L(strip_x)
-
-
- movl %edx, %eax
- movl %ebx, %ecx
-
-L(done_shll):
- shll %cl, %eax
- popl %ebx
-
- ret
-
-
-C -----------------------------------------------------------------------------
-C Two or more limbs.
-C
-C x={src,size} is reduced modulo y using either a plain mod_1 style
-C remainder, or a modexact_1 style exact division.
-
-deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
-
- ALIGN(8)
-L(size_two_or_more):
- C eax
- C ebx
- C ecx common twos
- C edx y, without common twos
- C esi
- C edi
- C ebp
-
-deflit(FRAME_TWO_OR_MORE, FRAME)
-
- pushl %edi defframe_pushl(SAVE_EDI)
- movl PARAM_SRC, %ebx
-
-L(y_twos):
- shrl %edx
- jnc L(y_twos)
-
- movl %ecx, %edi C common twos
- movl PARAM_SIZE, %ecx
-
- pushl %esi defframe_pushl(SAVE_ESI)
- leal 1(%edx,%edx), %esi C y (odd)
-
- movl -4(%ebx,%ecx,4), %eax C src high limb
-
- cmpl %edx, %eax C carry if high<divisor
-
- sbbl %edx, %edx C -1 if high<divisor
-
- addl %edx, %ecx C skip one limb if high<divisor
- andl %eax, %edx
-
- cmpl $MODEXACT_THRESHOLD, %ecx
- jae L(modexact)
-
-
-L(divide_top):
- C eax scratch (quotient)
- C ebx src
- C ecx counter, size-1 to 1
- C edx carry (remainder)
- C esi divisor (odd)
- C edi
- C ebp
-
- movl -4(%ebx,%ecx,4), %eax
- divl %esi
- loop L(divide_top)
-
-
- movl %edx, %eax C x
- movl %esi, %edx C y (odd)
-
- movl %edi, %ebx C common twos
- popl %esi
-
- popl %edi
- leal 1(%eax), %ecx
-
- orl %eax, %eax
- jnz L(strip_x_and)
-
-
- movl %ebx, %ecx
- movl %edx, %eax
-
- shll %cl, %eax
- popl %ebx
-
- ret
-
-
- ALIGN(8)
-L(modexact):
- C eax
- C ebx src ptr
- C ecx size or size-1
- C edx
- C esi y odd
- C edi common twos
- C ebp
-
- movl PARAM_SIZE, %eax
- pushl %esi FRAME_pushl()
-
- pushl %eax FRAME_pushl()
-
- pushl %ebx FRAME_pushl()
-
-ifdef(`PIC_WITH_EBX',`
- nop C code alignment
- call L(movl_eip_ebx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
-')
- CALL( mpn_modexact_1_odd)
-
- movl %esi, %edx C y odd
- movl SAVE_ESI, %esi
-
- movl %edi, %ebx C common twos
- movl SAVE_EDI, %edi
-
- addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp
- orl %eax, %eax
-
- leal 1(%eax), %ecx
- jnz L(strip_x_and)
-
-
- movl %ebx, %ecx
- movl %edx, %eax
-
- shll %cl, %eax
- popl %ebx
-
- ret
-
-
-ifdef(`PIC_WITH_EBX',`
-L(movl_eip_ebx):
- movl (%esp), %ebx
- ret_internal
-')
-
-EPILOGUE()
diff --git a/mpn/x86/k7/gcd_1.asm b/mpn/x86/k7/gcd_1.asm
deleted file mode 100644
index 479d2c2b7..000000000
--- a/mpn/x86/k7/gcd_1.asm
+++ /dev/null
@@ -1,193 +0,0 @@
-dnl x86 mpn_gcd_1 optimised for AMD K7.
-
-dnl Contributed to the GNU project by Kevin Ryde. Rehacked by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software
-dnl Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/bit (approx)
-C AMD K7 5.31
-C AMD K8,K9 5.33
-C AMD K10 5.30
-C AMD bd1 ?
-C AMD bobcat 7.02
-C Intel P4-2 10.1
-C Intel P4-3/4 10.0
-C Intel P6/13 5.88
-C Intel core2 6.26
-C Intel NHM 6.83
-C Intel SBR 8.50
-C Intel atom 8.90
-C VIA nano ?
-C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
-
-C TODO
-C * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny.
-C * Stream things better through registers, avoiding some copying.
-C * For ELF, avoid putting GOT base in both ebx and esi. Needs special
-C LEA/LEAL or else discrete code here.
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-
-deflit(MAXSHIFT, 6)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
-
-DEF_OBJECT(ctz_table,64)
- .byte MAXSHIFT
-forloop(i,1,MASK,
-` .byte m4_count_trailing_zeros(i)
-')
-END_OBJECT(ctz_table)
-
-C Threshold of when to reduce with div when U is one limb. Should be about
-C time_in_cycles(div) / (cycles/bit); the div is done inline, without a call.
-define(`DIV_THRES_LOG2', 7)
-
-
-define(`up', `%edi')
-define(`n', `%esi')
-define(`v0', `%edx')
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- push %edi
- push %esi
-
- mov 12(%esp), up
- mov 16(%esp), n
- mov 20(%esp), v0
-
- mov (up), %eax C U low limb
- or v0, %eax C x | y
- mov $-1, %ecx
-
-L(twos):
- inc %ecx
- shr %eax
- jnc L(twos)
-
- shr %cl, v0
- mov %ecx, %eax C common twos
-
-L(divide_strip_y):
- shr v0
- jnc L(divide_strip_y)
- adc v0, v0
-
- push %eax
- push v0
-
- cmp $1, n
- jnz L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with div if u0 >> v0.
- mov (up), %ecx
- mov %ecx, %eax
- shr $DIV_THRES_LOG2, %ecx
- cmp %ecx, v0
- ja L(reduced)
-
- mov v0, %esi
- xor %edx, %edx
- div %esi
- mov %edx, %eax
- jmp L(reduced)
-
-L(reduce_nby1):
-ifdef(`PIC_WITH_EBX',`dnl
- push %ebx
- add $-4, %esp
- call L(movl_eip_ebx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
-')
- push v0 C param 3
- push n C param 2
- push up C param 1
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
- jl L(bmod)
- CALL( mpn_mod_1)
- jmp L(called)
-L(bmod):
- CALL( mpn_modexact_1_odd)
-
-L(called):
-ifdef(`PIC_WITH_EBX',`dnl
- add $16, %esp C deallocate params
- pop %ebx
-',`
- add $12, %esp C deallocate params
-')
-L(reduced):
- pop %edx
-
- LEAL( ctz_table, %esi)
- test %eax, %eax
- mov %eax, %ecx
- jnz L(mid)
- jmp L(end)
-
- ALIGN(16) C K8 BC P4 NHM SBR
-L(top): cmovc( %ecx, %eax) C if x-y < 0 0
- cmovc( %edi, %edx) C use x,y-x 0
-L(mid): and $MASK, %ecx C 0
- movzbl (%esi,%ecx), %ecx C 1
- jz L(shift_alot) C 1
- shr %cl, %eax C 3
- mov %eax, %edi C 4
- mov %edx, %ecx C 3
- sub %eax, %ecx C 4
- sub %edx, %eax C 4
- jnz L(top) C 5
-
-L(end): pop %ecx
- mov %edx, %eax
- shl %cl, %eax
- pop %esi
- pop %edi
- ret
-
-L(shift_alot):
- shr $MAXSHIFT, %eax
- mov %eax, %ecx
- jmp L(mid)
-
-ifdef(`PIC_WITH_EBX',`dnl
-L(movl_eip_ebx):
- mov (%esp), %ebx
- ret
-')
-EPILOGUE()
-ASM_END()
diff --git a/mpn/x86/p6/gcd_1.asm b/mpn/x86/p6/gcd_1.asm
deleted file mode 100644
index eafbf4a79..000000000
--- a/mpn/x86/p6/gcd_1.asm
+++ /dev/null
@@ -1,161 +0,0 @@
-dnl x86 mpn_gcd_1 optimised for processors with fast BSF.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked by Torbjorn Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2015 Free Software
-dnl Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/bit (approx)
-C AMD K7 7.80
-C AMD K8,K9 7.79
-C AMD K10 4.08
-C AMD bd1 ?
-C AMD bobcat 7.82
-C Intel P4-2 14.9
-C Intel P4-3/4 14.0
-C Intel P6/13 5.09
-C Intel core2 4.22
-C Intel NHM 5.00
-C Intel SBR 5.00
-C Intel atom 17.1
-C VIA nano ?
-C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-
-define(`up', `%edi')
-define(`n', `%esi')
-define(`v0', `%edx')
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- push %edi
- push %esi
-
- mov 12(%esp), up
- mov 16(%esp), n
- mov 20(%esp), v0
-
- mov (up), %eax C U low limb
- or v0, %eax
- bsf %eax, %eax C min(ctz(u0),ctz(v0))
-
- bsf v0, %ecx
- shr %cl, v0
-
- push %eax C preserve common twos over call
- push v0 C preserve v0 argument over call
-
- cmp $1, n
- jnz L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- mov (up), %ecx
- mov %ecx, %eax
- shr $BMOD_THRES_LOG2, %ecx
- cmp %ecx, v0
- ja L(reduced)
- jmp L(bmod)
-
-L(reduce_nby1):
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
- jl L(bmod)
-ifdef(`PIC_WITH_EBX',`dnl
- push %ebx
- add $-4, %esp
- call L(movl_eip_to_ebx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
-')
- push v0 C param 3
- push n C param 2
- push up C param 1
- CALL( mpn_mod_1)
- jmp L(called)
-
-L(bmod):
-ifdef(`PIC_WITH_EBX',`dnl
- push %ebx
- add $-4, %esp
- call L(movl_eip_to_ebx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
-')
- push v0 C param 3
- push n C param 2
- push up C param 1
- CALL( mpn_modexact_1_odd)
-
-L(called):
-ifdef(`PIC_WITH_EBX',`dnl
- add $16, %esp C deallocate params
- pop %ebx
-',`
- add $12, %esp C deallocate params
-')
-L(reduced):
- pop %edx
-
- bsf %eax, %ecx
-C test %eax, %eax C FIXME: does this lower latency?
- jnz L(mid)
- jmp L(end)
-
- ALIGN(16) C K10 BD C2 NHM SBR
-L(top): cmovc( %esi, %eax) C if x-y < 0 0,3 0,3 0,6 0,5 0,5
- cmovc( %edi, %edx) C use x,y-x 0,3 0,3 2,8 1,7 1,7
-L(mid): shr %cl, %eax C 1,7 1,6 2,8 2,8 2,8
- mov %edx, %esi C 1 1 4 3 3
- sub %eax, %esi C 2 2 5 4 4
- bsf %esi, %ecx C 3 3 6 5 5
- mov %eax, %edi C 2 2 3 3 4
- sub %edx, %eax C 2 2 4 3 4
- jnz L(top) C
-
-L(end): pop %ecx
- mov %edx, %eax
- shl %cl, %eax
-
- pop %esi
- pop %edi
- ret
-
-ifdef(`PIC_WITH_EBX',`dnl
-L(movl_eip_to_ebx):
- mov (%esp), %ebx
- ret
-')
-EPILOGUE()
diff --git a/mpn/x86_64/bd1/gcd_1.asm b/mpn/x86_64/bd1/gcd_1.asm
deleted file mode 100644
index 3d8e5c7ab..000000000
--- a/mpn/x86_64/bd1/gcd_1.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl AMD64 mpn_gcd_1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/core2/gcd_1.asm')
diff --git a/mpn/x86_64/bd2/gcd_1.asm b/mpn/x86_64/bd2/gcd_1.asm
deleted file mode 100644
index 42b71a7bc..000000000
--- a/mpn/x86_64/bd2/gcd_1.asm
+++ /dev/null
@@ -1,164 +0,0 @@
-dnl AMD64 mpn_gcd_1 optimised for AMD BD2-BD4, Zen.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software
-dnl Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/bit (approx)
-C AMD K8,K9 ?
-C AMD K10 ?
-C AMD bd1 ?
-C AMD bd2 ?
-C AMD bd3 ?
-C AMD bd4 3.65
-C AMD bt1 ?
-C AMD bt2 ?
-C AMD zn1 3.5
-C AMD zn2 3.8
-C Intel P4 ?
-C Intel core2 ?
-C Intel NHM ?
-C Intel SBR ?
-C Intel IBR ?
-C Intel HWL ?
-C Intel BWL ?
-C Intel SKL ?
-C Intel atom ?
-C Intel SLM ?
-C Intel GLM ?
-C Intel GLM+ ?
-C VIA nano ?
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C TODO
-C * Optimise inner-loop for specific CPUs.
-C * Use DIV for 1-by-1 reductions, at least for some CPUs.
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-C INPUT PARAMETERS
-define(`up', `%rdi')
-define(`n', `%rsi')
-define(`v0', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFDOS(`define(`STACK_ALLOC', 40)')
-IFSTD(`define(`STACK_ALLOC', 8)')
-
-C Undo some configure cleverness.
-C The problem is that C only defines the '1c' variant, and that configure
-C therefore considers modexact_1c to be the base function. It then adds a
-C special fat rule for mpn_modexact_1_odd, messing up things when a cpudep
-C gcd_1 exists without a corresponding cpudep mode1o.
-ifdef(`WANT_FAT_BINARY', `
- define(`mpn_modexact_1_odd', `MPN_PREFIX`modexact_1_odd_x86_64'')')
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- FUNC_ENTRY(3)
- mov (up), %rax C U low limb
- or v0, %rax C x | y
- bsf %rax, %rax C min(ctz(u0),ctz(v0))
-
- bsf v0, %rcx
- shr R8(%rcx), v0
-
- push %rax C preserve common twos over call
-
- cmp $1, n
- jnz L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- mov (up), %r8
- mov %r8, %rax
- shr $BMOD_THRES_LOG2, %r8
- cmp %r8, v0
- ja L(reduced)
-
-L(bmod):
- push v0 C preserve v0 argument over call
- sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment
-IFDOS(` mov %rdx, %r8 ')
-IFDOS(` mov %rsi, %rdx ')
-IFDOS(` mov %rdi, %rcx ')
- ASSERT(nz, `test $15, %rsp')
- CALL( mpn_modexact_1_odd)
-
-L(called):
- add $STACK_ALLOC, %rsp
- pop v0
-
-L(reduced):
- bsf %rax, %rcx
-C test %rax, %rax C FIXME: does this lower latency?
- jnz L(mid)
- jmp L(end)
-
-L(reduce_nby1):
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
- jl L(bmod)
-
- push v0 C preserve v0 argument over call
- sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment
-IFDOS(` mov %rdx, %r8 ')
-IFDOS(` mov %rsi, %rdx ')
-IFDOS(` mov %rdi, %rcx ')
- ASSERT(nz, `test $15, %rsp')
- CALL( mpn_mod_1)
- jmp L(called)
-
- ALIGN(16) C K10 BD1 BD2 ZEN CNR NHM SBR
-L(top): cmovc %r10, %rax C if x-y < 0 0,3 0,3 0,3 0,3 0,6 0,5 0,5
- cmovc %r9, v0 C use x,y-x 0,3 0,3 0,3 0,3 2,8 1,7 1,7
-L(mid): shr R8(%rcx), %rax C 1,7 1,6 1,5 1,4 2,8 2,8 2,8
- mov v0, %r10 C 1 1 1 1 4 3 3
- sub %rax, %r10 C 2 2 2 1 5 4 4
- rep;bsf %r10, %rcx C tzcnt! 3 3 3 2 6 5 5
- mov %rax, %r9 C 2 2 2 2 3 3 4
- sub v0, %rax C 2 2 2 2 4 3 4
- jnz L(top) C
-
-L(end): pop %rcx C common twos
- mov v0, %rax
- shl R8(%rcx), %rax
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/mpn/x86_64/bt2/gcd_1.asm b/mpn/x86_64/bt2/gcd_1.asm
deleted file mode 100644
index 133d98363..000000000
--- a/mpn/x86_64/bt2/gcd_1.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl AMD64 mpn_gcd_1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/bd2/gcd_1.asm')
diff --git a/mpn/x86_64/core2/gcd_1.asm b/mpn/x86_64/core2/gcd_1.asm
deleted file mode 100644
index 52425a669..000000000
--- a/mpn/x86_64/core2/gcd_1.asm
+++ /dev/null
@@ -1,151 +0,0 @@
-dnl AMD64 mpn_gcd_1 optimised for Intel C2, NHM, SBR and AMD K10, BD.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software
-dnl Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/bit (approx)
-C AMD K8,K9 8.50
-C AMD K10 4.30
-C AMD bd1 5.00
-C AMD bobcat 10.0
-C Intel P4 18.6
-C Intel core2 3.83
-C Intel NHM 5.17
-C Intel SBR 4.69
-C Intel atom 17.0
-C VIA nano 5.44
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C TODO
-C * Optimise inner-loop for specific CPUs.
-C * Use DIV for 1-by-1 reductions, at least for some CPUs.
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-C INPUT PARAMETERS
-define(`up', `%rdi')
-define(`n', `%rsi')
-define(`v0', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFDOS(`define(`STACK_ALLOC', 40)')
-IFSTD(`define(`STACK_ALLOC', 8)')
-
-C Undo some configure cleverness.
-C The problem is that C only defines the '1c' variant, and that configure
-C therefore considers modexact_1c to be the base function. It then adds a
-C special fat rule for mpn_modexact_1_odd, messing up things when a cpudep
-C gcd_1 exists without a corresponding cpudep mode1o.
-ifdef(`WANT_FAT_BINARY', `
- define(`mpn_modexact_1_odd', `MPN_PREFIX`modexact_1_odd_x86_64'')')
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- FUNC_ENTRY(3)
- mov (up), %rax C U low limb
- or v0, %rax C x | y
- bsf %rax, %rax C min(ctz(u0),ctz(v0))
-
- bsf v0, %rcx
- shr R8(%rcx), v0
-
- push %rax C preserve common twos over call
-
- cmp $1, n
- jnz L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- mov (up), %r8
- mov %r8, %rax
- shr $BMOD_THRES_LOG2, %r8
- cmp %r8, v0
- ja L(reduced)
-
-L(bmod):
- push v0 C preserve v0 argument over call
- sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment
-IFDOS(` mov %rdx, %r8 ')
-IFDOS(` mov %rsi, %rdx ')
-IFDOS(` mov %rdi, %rcx ')
- ASSERT(nz, `test $15, %rsp')
- CALL( mpn_modexact_1_odd)
-
-L(called):
- add $STACK_ALLOC, %rsp
- pop v0
-
-L(reduced):
- bsf %rax, %rcx
-C test %rax, %rax C FIXME: does this lower latency?
- jnz L(mid)
- jmp L(end)
-
-L(reduce_nby1):
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
- jl L(bmod)
-
- push v0 C preserve v0 argument over call
- sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment
-IFDOS(` mov %rdx, %r8 ')
-IFDOS(` mov %rsi, %rdx ')
-IFDOS(` mov %rdi, %rcx ')
- ASSERT(nz, `test $15, %rsp')
- CALL( mpn_mod_1)
- jmp L(called)
-
- ALIGN(16) C K10 BD C2 NHM SBR
-L(top): cmovc %r10, %rax C if x-y < 0 0,3 0,3 0,6 0,5 0,5
- cmovc %r9, v0 C use x,y-x 0,3 0,3 2,8 1,7 1,7
-L(mid): shr R8(%rcx), %rax C 1,7 1,6 2,8 2,8 2,8
- mov v0, %r10 C 1 1 4 3 3
- sub %rax, %r10 C 2 2 5 4 4
- bsf %r10, %rcx C 3 3 6 5 5
- mov %rax, %r9 C 2 2 3 3 4
- sub v0, %rax C 2 2 4 3 4
- jnz L(top) C
-
-L(end): pop %rcx C common twos
- mov v0, %rax
- shl R8(%rcx), %rax
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/mpn/x86_64/gcd_1.asm b/mpn/x86_64/gcd_1.asm
deleted file mode 100644
index 65eba6960..000000000
--- a/mpn/x86_64/gcd_1.asm
+++ /dev/null
@@ -1,170 +0,0 @@
-dnl AMD64 mpn_gcd_1 -- mpn by 1 gcd.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software
-dnl Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/bit (approx)
-C AMD K8,K9 5.21 (4.95)
-C AMD K10 5.15 (5.00)
-C AMD bd1 5.42 (5.14)
-C AMD bobcat 6.71 (6.56)
-C Intel P4 13.5 (12.75)
-C Intel core2 6.20 (6.16)
-C Intel NHM 6.49 (6.25)
-C Intel SBR 7.75 (7.57)
-C Intel atom 8.77 (8.54)
-C VIA nano 6.60 (6.20)
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-C It has MASK+1 byte entries, one per value of the low MAXSHIFT bits.
-deflit(MAXSHIFT, 7)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) C mask for the low MAXSHIFT bits
-
-DEF_OBJECT(ctz_table,64)
- .byte MAXSHIFT C entry for n == 0
-forloop(i,1,MASK,
-` .byte m4_count_trailing_zeros(i)
-')
-END_OBJECT(ctz_table)
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 8)
-C The single-limb path calls bmod only when v0 <= u0 >> BMOD_THRES_LOG2.
-C INPUT PARAMETERS
-define(`up', `%rdi') C pointer to the limbs of U
-define(`n', `%rsi') C size of U in limbs
-define(`v0', `%rdx') C the single limb V
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-C STACK_ALLOC is the rsp adjustment made around each call below.
-IFDOS(`define(`STACK_ALLOC', 40)')
-IFSTD(`define(`STACK_ALLOC', 8)')
-C mpn_gcd_1: gcd of the n-limb number at up and the limb v0, returned in rax.
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- FUNC_ENTRY(3)
- mov (up), %rax C U low limb
- mov $-1, R32(%rcx) C start at -1, the loop increments first
- or v0, %rax C x | y
-C Count the trailing zero bits common to u0 and v0 into rcx.
-L(twos):
- inc R32(%rcx)
- shr %rax C CF = bit shifted out
- jnc L(twos) C repeat until a one bit comes out
-
- shr R8(%rcx), v0 C remove the common twos from v0
- push %rcx C common twos
-C Strip the remaining low zero bits so that v0 is odd.
-L(divide_strip_y):
- shr v0
- jnc L(divide_strip_y)
- adc v0, v0 C put the one bit back: v0 = 2*v0 + 1
-
- cmp $1, n
-ifelse(BMOD_1_TO_MOD_1_THRESHOLD, MP_SIZE_T_MAX,`
- jnz L(bmod) C n != 1 and no mod_1 path: always bmod
-',`
- jnz L(reduce_nby1) C n != 1: choose bmod or mod_1 by size
-')
-C Both U and V are single limbs, reduce with bmod if u0 is much larger than v0.
- mov (up), %r8
- mov %r8, %rax C rax = u0, used unreduced if the call is skipped
- shr $BMOD_THRES_LOG2, %r8
- cmp %r8, v0
- ja L(reduced) C v0 > u0 >> BMOD_THRES_LOG2: skip the call
-
-L(bmod):
- push v0 C preserve v0 argument over call
- sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment
-IFDOS(` mov %rdx, %r8 ')
-IFDOS(` mov %rsi, %rdx ')
-IFDOS(` mov %rdi, %rcx ')
- ASSERT(nz, `test $15, %rsp')
- CALL( mpn_modexact_1_odd)
-
-L(called):
- add $STACK_ALLOC, %rsp C undo the rsp adjustment
- pop v0 C restore v0 saved before the call
-C Binary gcd phase: each pass leaves |x-y| in rax and min(x,y) in v0.
-L(reduced):
- LEA( ctz_table, %rsi) C rsi = table base for the lookups at L(mid)
- test %rax, %rax
- mov %rax, %rcx C rcx = copy of x for the table lookup
- jnz L(mid) C flags are still those of the test: x != 0
- jmp L(end) C x == 0: the gcd is v0 times the common twos
-
-ifelse(BMOD_1_TO_MOD_1_THRESHOLD, `MP_SIZE_T_MAX',,`
-L(reduce_nby1):
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
- jl L(bmod) C n below the threshold: use mpn_modexact_1_odd
-C Otherwise reduce U with mpn_mod_1.
- push v0 C preserve v0 argument over call
- sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment
-IFDOS(` mov %rdx, %r8 ')
-IFDOS(` mov %rsi, %rdx ')
-IFDOS(` mov %rdi, %rcx ')
- ASSERT(nz, `test $15, %rsp')
- CALL( mpn_mod_1)
- jmp L(called) C share the stack cleanup
-')
- ALIGN(16) C K8 BC P4 NHM SBR
-L(top): cmovc %rcx, %rax C if x-y < 0 0
- cmovc %rdi, v0 C use x,y-x 0
-L(mid): and $MASK, R32(%rcx) C 0
- movzbl (%rsi,%rcx), R32(%rcx) C 1
- jz L(shift_alot) C 1
- shr R8(%rcx), %rax C 3
- mov %rax, %rdi C 4
- mov v0, %rcx C 3
- sub %rax, %rcx C 4
- sub v0, %rax C 4
- jnz L(top) C
-C Reached with x == y or x == 0: v0 is the gcd without the common twos.
-L(end): pop %rcx
- mov v0, %rax
- shl R8(%rcx), %rax C put the common factors of two back
- FUNC_EXIT()
- ret
-C The low MAXSHIFT bits of x are all zero: shift them out and look up again.
-L(shift_alot):
- shr $MAXSHIFT, %rax
- mov %rax, %rcx
- jmp L(mid)
-EPILOGUE()
diff --git a/mpn/x86_64/k10/gcd_1.asm b/mpn/x86_64/k10/gcd_1.asm
deleted file mode 100644
index 3d8e5c7ab..000000000
--- a/mpn/x86_64/k10/gcd_1.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl AMD64 mpn_gcd_1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-dnl No code of its own: the implementation is included from core2/gcd_1.asm.
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/core2/gcd_1.asm')
diff --git a/mpn/x86_64/nano/gcd_1.asm b/mpn/x86_64/nano/gcd_1.asm
deleted file mode 100644
index 3d8e5c7ab..000000000
--- a/mpn/x86_64/nano/gcd_1.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl AMD64 mpn_gcd_1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-dnl No code of its own: the implementation is included from core2/gcd_1.asm.
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/core2/gcd_1.asm')
diff --git a/mpn/x86_64/zen/gcd_1.asm b/mpn/x86_64/zen/gcd_1.asm
deleted file mode 100644
index 133d98363..000000000
--- a/mpn/x86_64/zen/gcd_1.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl AMD64 mpn_gcd_1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-dnl No code of its own: the implementation is included from bd2/gcd_1.asm.
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/bd2/gcd_1.asm')