diff options
Diffstat (limited to 'mpn/x86')
-rw-r--r-- | mpn/x86/atom/lshift.asm | 4 | ||||
-rw-r--r-- | mpn/x86/atom/sse2/mul_1.asm | 2 | ||||
-rw-r--r-- | mpn/x86/bdiv_dbm1c.asm | 4 | ||||
-rw-r--r-- | mpn/x86/bdiv_q_1.asm | 2 | ||||
-rw-r--r-- | mpn/x86/k7/addlsh1_n.asm | 6 | ||||
-rw-r--r-- | mpn/x86/k7/invert_limb.asm | 2 | ||||
-rw-r--r-- | mpn/x86/k7/sublsh1_n.asm | 8 | ||||
-rw-r--r-- | mpn/x86/p6/bdiv_q_1.asm | 4 | ||||
-rw-r--r-- | mpn/x86/pentium/bdiv_q_1.asm | 2 |
9 files changed, 17 insertions, 17 deletions
diff --git a/mpn/x86/atom/lshift.asm b/mpn/x86/atom/lshift.asm index d8cb8b505..1005cce59 100644 --- a/mpn/x86/atom/lshift.asm +++ b/mpn/x86/atom/lshift.asm @@ -160,7 +160,7 @@ deflit(`FRAME',4) shr $2, %eax C (size + 3) / 4 and $3, %edx C (size - 1) % 4 jz L(goloop) C jmp if size == 1 (mod 4) - shr %edx + shr %edx jnc L(odd) C jum if size == 3 (mod 4) add %ecx, %ecx @@ -173,7 +173,7 @@ deflit(`FRAME',4) jnz L(goloop) C jump if size == 0 (mod 4) L(odd): lea -8(up), up lea -8(rp), rp - jmp L(sentry) C reached if size == 2 or 3 (mod 4) + jmp L(sentry) C reached if size == 2 or 3 (mod 4) L(sloop): adc %ecx, %ecx diff --git a/mpn/x86/atom/sse2/mul_1.asm b/mpn/x86/atom/sse2/mul_1.asm index dd9b95366..5cd86caec 100644 --- a/mpn/x86/atom/sse2/mul_1.asm +++ b/mpn/x86/atom/sse2/mul_1.asm @@ -62,7 +62,7 @@ EPILOGUE() PROLOGUE(mpn_mul_1) pxor %mm6, %mm6 L(ent): push %esi FRAME_pushl() - mov PARAM_SRC, up + mov PARAM_SRC, up mov PARAM_SIZE, %eax C size movd PARAM_MUL, %mm7 movd (up), %mm0 diff --git a/mpn/x86/bdiv_dbm1c.asm b/mpn/x86/bdiv_dbm1c.asm index 201ef173d..ac9faf270 100644 --- a/mpn/x86/bdiv_dbm1c.asm +++ b/mpn/x86/bdiv_dbm1c.asm @@ -24,10 +24,10 @@ C P5 C P6 model 0-8,10-12) C P6 model 9 (Banias) C P6 model 13 (Dothan) 5.1 -C P4 model 0 (Willamette) +C P4 model 0 (Willamette) C P4 model 1 (?) C P4 model 2 (Northwood) 13.67 -C P4 model 3 (Prescott) +C P4 model 3 (Prescott) C P4 model 4 (Nocona) C Intel Atom C AMD K6 diff --git a/mpn/x86/bdiv_q_1.asm b/mpn/x86/bdiv_q_1.asm index 2528d01f7..7f344ab57 100644 --- a/mpn/x86/bdiv_q_1.asm +++ b/mpn/x86/bdiv_q_1.asm @@ -30,7 +30,7 @@ C K6 14.0 C K7 12.0 C P4 42.0 -MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) defframe(PARAM_SHIFT, 24) defframe(PARAM_INVERSE,20) diff --git a/mpn/x86/k7/addlsh1_n.asm b/mpn/x86/k7/addlsh1_n.asm index e5163b676..05df4a740 100644 --- a/mpn/x86/k7/addlsh1_n.asm +++ b/mpn/x86/k7/addlsh1_n.asm @@ -44,14 +44,14 @@ C AMD K8 C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32 C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately, C that means we need an initial magic multiply. -C +C C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern. We C cannot do rsblsh1_n since we feed carry from the shift blocks to the C add/subtract blocks, which is right for addition but reversed for C subtraction. We could perhaps do sublsh1_n, with some extra move insns, C without losing any time, since we're not issue limited but carry recurrency C latency. -C +C C Breaking carry recurrency might be a good idea. We would then need separate C registers for the shift carry and add/subtract carry, which in turn would C force is to 2*2-way unrolling. @@ -120,7 +120,7 @@ ifdef(`CPU_P6',` L(exact): incl VAR_COUNT jz L(end) - + ALIGN(16) L(top): ifdef(`CPU_P6',` diff --git a/mpn/x86/k7/invert_limb.asm b/mpn/x86/k7/invert_limb.asm index da6f28397..435fa96d0 100644 --- a/mpn/x86/k7/invert_limb.asm +++ b/mpn/x86/k7/invert_limb.asm @@ -60,7 +60,7 @@ ifdef(`DARWIN',` PROLOGUE(mpn_invert_limb) deflit(`FRAME', 0) mov PARAM_DIVISOR, %eax - C Avoid push/pop on k7. + C Avoid push/pop on k7. sub $8, %esp FRAME_subl_esp(8) mov %ebx, (%esp) mov %edi, 4(%esp) diff --git a/mpn/x86/k7/sublsh1_n.asm b/mpn/x86/k7/sublsh1_n.asm index 41993f99a..965348586 100644 --- a/mpn/x86/k7/sublsh1_n.asm +++ b/mpn/x86/k7/sublsh1_n.asm @@ -30,7 +30,7 @@ C cycles/limb C P5 C P6 model 0-8,10-12 C P6 model 9 (Banias) -C P6 model 13 (Dothan) +C P6 model 13 (Dothan) C P4 model 0 (Willamette) C P4 model 1 (?) C P4 model 2 (Northwood) @@ -38,12 +38,12 @@ C P4 model 3 (Prescott) C P4 model 4 (Nocona) C Intel Atom 6.75 C AMD K6 -C AMD K7 +C AMD K7 C AMD K8 C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32 C processors. It uses 2*4-way unrolling, for good reasons. -C +C C Breaking carry recurrency might be a good idea. We would then need separate C registers for the shift carry and add/subtract carry, which in turn would C force is to 2*2-way unrolling. @@ -114,7 +114,7 @@ ifdef(`CPU_P6',` adc %ebp, %ebp rcr %edx C restore 1st saved carry bit - + sbb %eax, (rp) sbb %ebx, 4(rp) sbb %ecx, 8(rp) diff --git a/mpn/x86/p6/bdiv_q_1.asm b/mpn/x86/p6/bdiv_q_1.asm index 3a8733a0d..0ffbc78e4 100644 --- a/mpn/x86/p6/bdiv_q_1.asm +++ b/mpn/x86/p6/bdiv_q_1.asm @@ -25,7 +25,7 @@ include(`../config.m4') C odd even divisor C P6: 10.0 12.0 cycles/limb -C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) +C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) C The odd case is basically the same as mpn_modexact_1_odd, just with an C extra store, and it runs at the same 10 cycles which is the dependent @@ -269,7 +269,7 @@ ifdef(`PIC',` imull %edx, %eax C inv*inv*d subl %eax, %ebp C inv = 2*inv - inv*inv*d - + jmp L(common) EPILOGUE() diff --git a/mpn/x86/pentium/bdiv_q_1.asm b/mpn/x86/pentium/bdiv_q_1.asm index 965173d1c..7e84fc817 100644 --- a/mpn/x86/pentium/bdiv_q_1.asm +++ b/mpn/x86/pentium/bdiv_q_1.asm @@ -27,7 +27,7 @@ C odd even C P54: 24.5 30.5 cycles/limb C P55: 23.0 28.0 -MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as C expected. On P54 in the even case the shrdl pairing nonsense (see |