diff options
43 files changed, 298 insertions, 216 deletions
diff --git a/mpn/x86_64/addaddmul_1msb0.asm b/mpn/x86_64/addaddmul_1msb0.asm index 89e7bed98..5b2b66847 100644 --- a/mpn/x86_64/addaddmul_1msb0.asm +++ b/mpn/x86_64/addaddmul_1msb0.asm @@ -20,9 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8: 2.167 -C P4: 12.0 -C P6-15: 4.0 +C AMD K8,K9 2.167 +C AMD K10 2.167 +C Intel P4 12.0 +C Intel core2 4.0 +C Intel corei ? +C Intel atom ? +C VIA nano ? C TODO C * Perhaps handle various n mod 3 sizes better. The code now is too large. diff --git a/mpn/x86_64/addmul_2.asm b/mpn/x86_64/addmul_2.asm index e762113f6..a2892f858 100644 --- a/mpn/x86_64/addmul_2.asm +++ b/mpn/x86_64/addmul_2.asm @@ -21,11 +21,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8,K9: 2.375 -C K10: 2.375 -C P4: ? -C P6 core2: 4.45 -C P6 corei7: 4.35 +C AMD K8,K9 2.375 +C AMD K10 2.375 +C Intel P4 ? +C Intel core2 4.45 +C Intel corei 4.35 +C Intel atom ? +C VIA nano 4.5 C This code is the result of running a code generation and optimization tool C suite written by David Harvey and Torbjorn Granlund. diff --git a/mpn/x86_64/aorrlsh1_n.asm b/mpn/x86_64/aorrlsh1_n.asm index 75fd009c6..3f53e6935 100644 --- a/mpn/x86_64/aorrlsh1_n.asm +++ b/mpn/x86_64/aorrlsh1_n.asm @@ -22,12 +22,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 2 -C K10: 2 -C P4: 13 -C P6 core2: 3.45 -C P6 corei7: 3.45 -C P6 atom: ? +C AMD K8,K9 2 +C AMD K10 2 +C Intel P4 13 +C Intel core2 3.45 +C Intel corei 3.45 +C Intel atom ? +C VIA nano ? C Sometimes speed degenerates, supposedly related to that some operand diff --git a/mpn/x86_64/aorrlsh2_n.asm b/mpn/x86_64/aorrlsh2_n.asm index 16cecef8d..11e586453 100644 --- a/mpn/x86_64/aorrlsh2_n.asm +++ b/mpn/x86_64/aorrlsh2_n.asm @@ -23,12 +23,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 2 -C K10: 2 -C P4: ? -C P6 core2: 3 -C P6 corei7: 2.75 -C P6 atom: ? 
+C AMD K8,K9 2 +C AMD K10 2 +C Intel P4 ? +C Intel core2 3 +C Intel corei 2.75 +C Intel atom ? +C VIA nano ? C INPUT PARAMETERS define(`rp', `%rdi') diff --git a/mpn/x86_64/aorrlsh_n.asm b/mpn/x86_64/aorrlsh_n.asm index 9aa8af97b..f5203767b 100644 --- a/mpn/x86_64/aorrlsh_n.asm +++ b/mpn/x86_64/aorrlsh_n.asm @@ -23,10 +23,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 3.25 (mpn_lshift + mpn_add_n costs about 4.1 c/l) -C K10: 3.25 (mpn_lshift + mpn_add_n costs about 4.1 c/l) -C P4: 14 -C P6-15: 4 +C AMD K8,K9 3.25 (mpn_lshift + mpn_add_n costs about 4.1 c/l) +C AMD K10 3.25 (mpn_lshift + mpn_add_n costs about 4.1 c/l) +C Intel P4 14 +C Intel core2 4 +C Intel corei ? +C Intel atom ? +C VIA nano ? C This was written quickly and not optimized at all. Surely one could get C closer to 3 c/l or perhaps even under 3 c/l. Ideas: diff --git a/mpn/x86_64/aors_n.asm b/mpn/x86_64/aors_n.asm index f2365ab45..be649868f 100644 --- a/mpn/x86_64/aors_n.asm +++ b/mpn/x86_64/aors_n.asm @@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8,K9: 1.5 -C K10: 1.5 -C P4: ? -C P6 core2: 4.9 -C P6 corei7: -C P6 atom: 4 +C AMD K8,K9 1.5 +C AMD K10 1.5 +C Intel P4 ? +C Intel core2 4.9 +C Intel corei ? +C Intel atom 4 +C VIA nano 3.25 C The inner loop of this code is the result of running a code generation and C optimization tool suite written by David Harvey and Torbjorn Granlund. diff --git a/mpn/x86_64/aorsmul_1.asm b/mpn/x86_64/aorsmul_1.asm index cbf4b4693..3bdba0a06 100644 --- a/mpn/x86_64/aorsmul_1.asm +++ b/mpn/x86_64/aorsmul_1.asm @@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8,K9: 2.5 -C K10: 2.5 -C P4: 14.9 -C P6 core2: 5.09 -C P6 corei7: -C P6 atom: 21.3 +C AMD K8,K9 2.5 +C AMD K10 2.5 +C Intel P4 14.9 +C Intel core2 5.09 +C Intel corei ? 
+C Intel atom 21.3 +C VIA nano 5.5 C The inner loop of this code is the result of running a code generation and C optimization tool suite written by David Harvey and Torbjorn Granlund. diff --git a/mpn/x86_64/atom/aors_n.asm b/mpn/x86_64/atom/aors_n.asm index 32c19424f..72e231b17 100644 --- a/mpn/x86_64/atom/aors_n.asm +++ b/mpn/x86_64/atom/aors_n.asm @@ -21,11 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 1.85 -C K10: ? -C P4: ? -C P6-15 (Core2): ? -C P6-28 (Atom): 3 +C AMD K8,K9 1.85 +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel corei ? +C Intel atom 3 +C VIA nano ? C INPUT PARAMETERS define(`rp', `%rdi') diff --git a/mpn/x86_64/bdiv_dbm1c.asm b/mpn/x86_64/bdiv_dbm1c.asm index d6775aef9..72173ce85 100644 --- a/mpn/x86_64/bdiv_dbm1c.asm +++ b/mpn/x86_64/bdiv_dbm1c.asm @@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8,K9: 2.25 -C K10: ? -C P4: 12.5 -C P6 core2: 4.0 -C P6 corei7: 3.8 -C P6 atom: 20 +C AMD K8,K9 2.25 +C AMD K10 ? +C Intel P4 12.5 +C Intel core2 4.0 +C Intel corei 3.8 +C Intel atom 20 +C VIA nano 4.2 C TODO C * Do proper 4-way feed-in instead of the current epilogue diff --git a/mpn/x86_64/bdiv_q_1.asm b/mpn/x86_64/bdiv_q_1.asm index 2356f2bc0..01624a52a 100644 --- a/mpn/x86_64/bdiv_q_1.asm +++ b/mpn/x86_64/bdiv_q_1.asm @@ -23,12 +23,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 10 -C K10: 10 -C P4: 33 -C P6 core2: 13.25 -C P6 corei7: 14 -C P6 atom: 42 +C AMD K8,K9 10 +C AMD K10 10 +C Intel P4 33 +C Intel core2 13.25 +C Intel corei 14 +C Intel atom 42 +C VIA nano ? C INPUT PARAMETERS diff --git a/mpn/x86_64/com.asm b/mpn/x86_64/com.asm index 699da11b6..db968e96a 100644 --- a/mpn/x86_64/com.asm +++ b/mpn/x86_64/com.asm @@ -21,10 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 1.25 -C K10: 1.25 -C P4: 2.78 -C P6-15: 1.1 +C AMD K8,K9 1.25 +C AMD K10 1.25 +C Intel P4 2.78 +C Intel core2 1.1 +C Intel corei 1.5 +C Intel atom ? 
+C VIA nano 2 C INPUT PARAMETERS define(`rp',`%rdi') diff --git a/mpn/x86_64/copyd.asm b/mpn/x86_64/copyd.asm index f5c451cfe..89b26aa16 100644 --- a/mpn/x86_64/copyd.asm +++ b/mpn/x86_64/copyd.asm @@ -21,11 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 1 -C K10: 1 -C P4: 2.8 -C P6 core2: 1.2 -C P6 corei7: 1 +C AMD K8,K9 1 +C AMD K10 1 +C Intel P4 2.8 +C Intel core2 1.2 +C Intel corei ? +C Intel atom ? +C VIA nano 2 C INPUT PARAMETERS diff --git a/mpn/x86_64/copyi.asm b/mpn/x86_64/copyi.asm index 506142be7..c7c46c02d 100644 --- a/mpn/x86_64/copyi.asm +++ b/mpn/x86_64/copyi.asm @@ -21,10 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 1 -C K10: 1 -C P4: 2.8 -C P6-15: 1.2 +C AMD K8,K9 1 +C AMD K10 1 +C Intel P4 2.8 +C Intel core2 1.2 +C Intel corei ? +C Intel atom ? +C VIA nano 2 C INPUT PARAMETERS diff --git a/mpn/x86_64/core2/aors_n.asm b/mpn/x86_64/core2/aors_n.asm index 3dc04d0b7..8a20eba65 100644 --- a/mpn/x86_64/core2/aors_n.asm +++ b/mpn/x86_64/core2/aors_n.asm @@ -21,11 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 2.25 -C K10: 2 -C P4: 10 -C P6 core2: 2.05 -C P6 corei7: 2.3 +C AMD K8,K9 2.25 +C AMD K10 2 +C Intel P4 10 +C Intel core2 2.05 +C Intel corei 2.3 +C Intel atom ? +C VIA nano ? C INPUT PARAMETERS define(`rp', `%rdi') diff --git a/mpn/x86_64/core2/aorslsh1_n.asm b/mpn/x86_64/core2/aorslsh1_n.asm index 18db7c96f..b8c752b3a 100644 --- a/mpn/x86_64/core2/aorslsh1_n.asm +++ b/mpn/x86_64/core2/aorslsh1_n.asm @@ -20,10 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8,K9: 4.25 -C K10: ? -C P4: ? -C P6-15: 3 +C AMD K8,K9 4.25 +C AMD K10 ? +C Intel P4 ? +C Intel core2 3 +C Intel corei 3 +C Intel atom ? +C VIA nano ? 
C INPUT PARAMETERS define(`rp',`%rdi') diff --git a/mpn/x86_64/core2/aorsmul_1.asm b/mpn/x86_64/core2/aorsmul_1.asm index 8dcccd994..bb4f663c4 100644 --- a/mpn/x86_64/core2/aorsmul_1.asm +++ b/mpn/x86_64/core2/aorsmul_1.asm @@ -20,11 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8,K9: 4 -C K10: 4 -C P4: ? -C P6 core2: 4.3-4.5 (fluctuating) -C P6 corei7: 5 +C AMD K8,K9 4 +C AMD K10 4 +C Intel P4 ? +C Intel core2 4.3-4.5 (fluctuating) +C Intel corei 5 +C Intel atom ? +C VIA nano ? C INPUT PARAMETERS define(`rp', `%rdi') diff --git a/mpn/x86_64/core2/lshift.asm b/mpn/x86_64/core2/lshift.asm index e3e400874..a15868b03 100644 --- a/mpn/x86_64/core2/lshift.asm +++ b/mpn/x86_64/core2/lshift.asm @@ -21,11 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 4.25 -C K10: 4.25 -C P4: 14.7 -C P6 core2: 1.27 -C P6 corei7: 1.5 +C AMD K8,K9 4.25 +C AMD K10 4.25 +C Intel P4 14.7 +C Intel core2 1.27 +C Intel corei 1.5 +C Intel atom ? +C VIA nano ? C INPUT PARAMETERS diff --git a/mpn/x86_64/core2/lshiftc.asm b/mpn/x86_64/core2/lshiftc.asm index bc014c855..a2d905dc8 100644 --- a/mpn/x86_64/core2/lshiftc.asm +++ b/mpn/x86_64/core2/lshiftc.asm @@ -21,11 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: ? -C K10: ? -C P4: ? -C P6 core2: 1.5 -C P6 corei7: 1.75 +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 1.5 +C Intel corei 1.75 +C Intel atom ? +C VIA nano ? C INPUT PARAMETERS diff --git a/mpn/x86_64/core2/rshift.asm b/mpn/x86_64/core2/rshift.asm index 485fd4b0d..bc6224064 100644 --- a/mpn/x86_64/core2/rshift.asm +++ b/mpn/x86_64/core2/rshift.asm @@ -21,11 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 4.25 -C K10: 4.25 -C P4: 14.7 -C P6 core2: 1.27 -C P6 corei7: 1.5 +C AMD K8,K9 4.25 +C AMD K10 4.25 +C Intel P4 14.7 +C Intel core2 1.27 +C Intel corei 1.5 +C Intel atom ? +C VIA nano ? 
C INPUT PARAMETERS diff --git a/mpn/x86_64/dive_1.asm b/mpn/x86_64/dive_1.asm index f3b6ac8fa..eb955c1ec 100644 --- a/mpn/x86_64/dive_1.asm +++ b/mpn/x86_64/dive_1.asm @@ -21,12 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 10 -C K10: 10 -C P4: 33 -C P6 core2: 13.25 -C P6 corei7: 14 -C P6 atom: 42 +C AMD K8,K9 10 +C AMD K10 10 +C Intel P4 33 +C Intel core2 13.25 +C Intel corei 14 +C Intel atom 42 +C VIA nano ? C A quick adoption of the 32-bit K7 code. diff --git a/mpn/x86_64/divrem_2.asm b/mpn/x86_64/divrem_2.asm index 88ffe9c74..6f5084f9b 100644 --- a/mpn/x86_64/divrem_2.asm +++ b/mpn/x86_64/divrem_2.asm @@ -21,10 +21,13 @@ include(`../config.m4') C norm frac -C K8 20 20 -C P4 73 73 -C P6 core2 37 37 -C P6 corei7 33 33 +C AMD K8,K9 20 20 +C AMD K10 20 20 +C Intel P4 73 73 +C Intel core2 37 37 +C Intel corei 33 33 +C Intel atom ? ? +C VIA nano ? ? C TODO C * Perhaps compute the inverse without relying on divq? Could either use diff --git a/mpn/x86_64/invert_limb.asm b/mpn/x86_64/invert_limb.asm index 8dcfae0b0..471ce685d 100644 --- a/mpn/x86_64/invert_limb.asm +++ b/mpn/x86_64/invert_limb.asm @@ -23,12 +23,13 @@ include(`../config.m4') C cycles/limb (approx) div -C K8,K9: 48 71 -C K10: 48 77 -C P4: 135 161 -C P6 core2: 69 116 -C P6 corei7: 55 89 -C P6 atom: 129 191 +C AMD K8,K9 48 71 +C AMD K10 48 77 +C Intel P4 135 161 +C Intel core2 69 116 +C Intel corei 55 89 +C Intel atom 129 191 +C VIA nano 79 ? C rax rcx rdx rdi rsi r8 diff --git a/mpn/x86_64/logops_n.asm b/mpn/x86_64/logops_n.asm index 1022b6137..63f82fd93 100644 --- a/mpn/x86_64/logops_n.asm +++ b/mpn/x86_64/logops_n.asm @@ -21,10 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 1.5 -C K10: 1.75-2 (fluctuating) -C P4: 2.8/3.35/3.60 (variant1/variant2/variant3) -C P6-15: 2.0 +C AMD K8,K9 1.5 +C AMD K10 1.75-2 (fluctuating) +C Intel P4 2.8/3.35/3.60 (variant1/variant2/variant3) +C Intel core2 2 +C Intel corei 2 +C Intel atom ? 
+C VIA nano 3.25 ifdef(`OPERATION_and_n',` define(`func',`mpn_and_n') diff --git a/mpn/x86_64/lshift.asm b/mpn/x86_64/lshift.asm index d59d8250a..2f3d5c94d 100644 --- a/mpn/x86_64/lshift.asm +++ b/mpn/x86_64/lshift.asm @@ -21,11 +21,13 @@ include(`../config.m4') C cycles/limb cycles/limb cnt=1 -C K8,K9: 2.375 1.375 -C K10: 2.375 1.375 -C P4: 8 10.5 -C P6-15 (Core2): 2.11 4.28 -C P6-28 (Atom): 5.75 3.5 +C AMD K8,K9 2.375 1.375 +C AMD K10 2.375 1.375 +C Intel P4 8 10.5 +C Intel core2 2.11 4.28 +C Intel corei ? ? +C Intel atom 5.75 3.5 +C VIA nano 3.5 2.25 C INPUT PARAMETERS diff --git a/mpn/x86_64/lshiftc.asm b/mpn/x86_64/lshiftc.asm index 2423529c4..a6b948196 100644 --- a/mpn/x86_64/lshiftc.asm +++ b/mpn/x86_64/lshiftc.asm @@ -21,11 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 2.75 -C K10: 2.75 -C P4: ? -C P6-15 (Core2): ? -C P6-28 (Atom): ? +C AMD K8,K9 2.75 +C AMD K10 2.75 +C Intel P4 ? +C Intel core2 ? +C Intel corei ? +C Intel atom ? +C VIA nano 3.75 C INPUT PARAMETERS diff --git a/mpn/x86_64/lshsub_n.asm b/mpn/x86_64/lshsub_n.asm index 6ae7c3640..e00038dea 100644 --- a/mpn/x86_64/lshsub_n.asm +++ b/mpn/x86_64/lshsub_n.asm @@ -21,10 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l) -C K10: 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l) -C P4: 16.5 -C P6-15: 4.35 +C AMD K8,K9 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l) +C AMD K10 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l) +C Intel P4 16.5 +C Intel core2 4.35 +C Intel corei ? +C Intel atom ? +C VIA nano ? C This was written quickly and not optimized at all, but it runs very well on C K8. But perhaps one could get under 3 c/l. Ideas: diff --git a/mpn/x86_64/mod_1_1.asm b/mpn/x86_64/mod_1_1.asm index fac7b412c..27f1e16c9 100644 --- a/mpn/x86_64/mod_1_1.asm +++ b/mpn/x86_64/mod_1_1.asm @@ -22,12 +22,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
include(`../config.m4') C cycles/limb -C K8,K9: 7 -C K10: 7 -C P4: 27 -C P6 core2: 14 -C P6 corei: 12.5 -C P6 atom: 37 +C AMD K8,K9 7 +C AMD K10 7 +C Intel P4 27 +C Intel core2 14 +C Intel corei 12.5 +C Intel atom 37 C VIA nano 15 ASM_START() diff --git a/mpn/x86_64/mod_1_2.asm b/mpn/x86_64/mod_1_2.asm index 77a05e0bd..41ac55a1a 100644 --- a/mpn/x86_64/mod_1_2.asm +++ b/mpn/x86_64/mod_1_2.asm @@ -22,12 +22,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8,K9: 4 -C K10: 4 -C P4: 19 -C P6 core2: 8 -C P6 corei: 6.5 -C P6 atom: 28 +C AMD K8,K9 4 +C AMD K10 4 +C Intel P4 19 +C Intel core2 8 +C Intel corei 6.5 +C Intel atom 28 C VIA nano 8 ASM_START() diff --git a/mpn/x86_64/mod_1_4.asm b/mpn/x86_64/mod_1_4.asm index 417991a6b..d2eb06837 100644 --- a/mpn/x86_64/mod_1_4.asm +++ b/mpn/x86_64/mod_1_4.asm @@ -22,12 +22,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8,K9: 3 -C K10: 3 -C P4: 15.5 -C P6 core2: 5 -C P6 corei: 4.25 -C P6 atom: 23 +C AMD K8,K9 3 +C AMD K10 3 +C Intel P4 15.5 +C Intel core2 5 +C Intel corei 4.25 +C Intel atom 23 C VIA nano 5 ASM_START() diff --git a/mpn/x86_64/mod_34lsub1.asm b/mpn/x86_64/mod_34lsub1.asm index 318fb96d6..59a960863 100644 --- a/mpn/x86_64/mod_34lsub1.asm +++ b/mpn/x86_64/mod_34lsub1.asm @@ -22,11 +22,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 1.0 -C K10: 1.12 -C P4: 3.25 -C P6-15 (Core2): 1.5 -C P6-28 (Atom): 2.5 +C AMD K8,K9 1.0 +C AMD K10 1.12 +C Intel P4 3.25 +C Intel core2 1.5 +C Intel corei 1.5 +C Intel atom 2.5 +C VIA nano 1.75 C INPUT PARAMETERS diff --git a/mpn/x86_64/mode1o.asm b/mpn/x86_64/mode1o.asm index ae5f83c29..d2e82b094 100644 --- a/mpn/x86_64/mode1o.asm +++ b/mpn/x86_64/mode1o.asm @@ -22,12 +22,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 10 -C K10: 10 -C P4: 33 -C P6 core2: 13 -C P6 corei7: 14.5 -C P6 Atom: 35 +C AMD K8,K9 10 +C AMD K10 10 +C 
Intel P4 33 +C Intel core2 13 +C Intel corei 14.5 +C Intel atom 35 +C VIA nano ? C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, diff --git a/mpn/x86_64/mul_1.asm b/mpn/x86_64/mul_1.asm index a0c45990e..29340c390 100644 --- a/mpn/x86_64/mul_1.asm +++ b/mpn/x86_64/mul_1.asm @@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8,K9: 2.5 -C K10: 2.5 -C P4: 12.3 -C P6 core2: 4.0 -C P6 corei7: 3.8 -C Atom: 19.8 +C AMD K8,K9 2.5 +C AMD K10 2.5 +C Intel P4 12.3 +C Intel core2 4.0 +C Intel corei 3.8 +C Intel atom 19.8 +C VIA nano ? C The inner loop of this code is the result of running a code generation and C optimization tool suite written by David Harvey and Torbjorn Granlund. diff --git a/mpn/x86_64/mul_2.asm b/mpn/x86_64/mul_2.asm index ab87aaf21..7f8d0a167 100644 --- a/mpn/x86_64/mul_2.asm +++ b/mpn/x86_64/mul_2.asm @@ -21,11 +21,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8,K9: 2.275 -C K10: 2.275 -C P4: ? -C P6 core2: 4.0 -C P6 corei7: 3.8 +C AMD K8,K9 2.275 +C AMD K10 2.275 +C Intel P4 ? +C Intel core2 4.0 +C Intel corei 3.8 +C Intel atom ? +C VIA nano ? C This code is the result of running a code generation and optimization tool C suite written by David Harvey and Torbjorn Granlund. diff --git a/mpn/x86_64/mul_basecase.asm b/mpn/x86_64/mul_basecase.asm index 532076629..bc0ea1933 100644 --- a/mpn/x86_64/mul_basecase.asm +++ b/mpn/x86_64/mul_basecase.asm @@ -22,10 +22,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8,K9: 2.375 -C K10: 2.375 -C P4: ? -C P6-15: 4.45 +C AMD K8,K9 2.375 +C AMD K10 2.375 +C Intel P4 ? +C Intel core2 4.45 +C Intel corei ? +C Intel atom ? +C VIA nano ? 
C The inner loops of this code are the result of running a code generation and C optimization tool suite written by David Harvey and Torbjorn Granlund. diff --git a/mpn/x86_64/pentium4/aors_n.asm b/mpn/x86_64/pentium4/aors_n.asm index 90f5a219b..bc49aa90b 100644 --- a/mpn/x86_64/pentium4/aors_n.asm +++ b/mpn/x86_64/pentium4/aors_n.asm @@ -21,10 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 2.8 -C K10: 2.8 -C P4: 4 -C P6-15: 3.6-5 (fluctuating) +C AMD K8,K9 2.8 +C AMD K10 2.8 +C Intel P4 4 +C Intel core2 3.6-5 (fluctuating) +C Intel corei ? +C Intel atom ? +C VIA nano ? C INPUT PARAMETERS diff --git a/mpn/x86_64/pentium4/aorslsh1_n.asm b/mpn/x86_64/pentium4/aorslsh1_n.asm index 0723f3e6c..cd5452768 100644 --- a/mpn/x86_64/pentium4/aorslsh1_n.asm +++ b/mpn/x86_64/pentium4/aorslsh1_n.asm @@ -21,10 +21,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb -C K8,K9: 3.8 -C K10: 4.8 -C P4: 5.8 -C P6-15: ? +C AMD K8,K9 3.8 +C AMD K10 4.8 +C Intel P4 5.8 +C Intel core2 ? +C Intel corei ? +C Intel atom ? +C VIA nano ? C INPUT PARAMETERS diff --git a/mpn/x86_64/pentium4/lshift.asm b/mpn/x86_64/pentium4/lshift.asm index 7596d9c5c..df52b95c6 100644 --- a/mpn/x86_64/pentium4/lshift.asm +++ b/mpn/x86_64/pentium4/lshift.asm @@ -21,11 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 2.5 -C K10: ? -C P4: 3.29 -C P6-15 (Core2): 2.1 (fluctuates, presumably cache related) -C P6-28 (Atom): 14.3 +C AMD K8,K9 2.5 +C AMD K10 ? +C Intel P4 3.29 +C Intel core2 2.1 (fluctuates, presumably cache related) +C Intel corei ? +C Intel atom 14.3 +C VIA nano ? C INPUT PARAMETERS define(`rp',`%rdi') diff --git a/mpn/x86_64/pentium4/rshift.asm b/mpn/x86_64/pentium4/rshift.asm index 61899c5ec..da0bb8b07 100644 --- a/mpn/x86_64/pentium4/rshift.asm +++ b/mpn/x86_64/pentium4/rshift.asm @@ -21,11 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 2.5 -C K10: ? 
-C P4: 3.29 -C P6-15 (Core2): 2.1 (fluctuates, presumably cache related) -C P6-28 (Atom): 14.3 +C AMD K8,K9 2.5 +C AMD K10 ? +C Intel P4 3.29 +C Intel core2 2.1 (fluctuates, presumably cache related) +C Intel corei ? +C Intel atom 14.3 +C VIA nano ? C INPUT PARAMETERS define(`rp',`%rdi') diff --git a/mpn/x86_64/popham.asm b/mpn/x86_64/popham.asm index e2bdb1a0b..bd335f63f 100644 --- a/mpn/x86_64/popham.asm +++ b/mpn/x86_64/popham.asm @@ -23,10 +23,13 @@ include(`../config.m4') C popcount hamdist C cycles/limb cycles/limb -C K8,K9: 6 7 -C K10: 6 7 -C P4: 12 14.3 -C P6-15: 7 8 +C AMD K8,K9 6 7 +C AMD K10 6 7 +C Intel P4 12 14.3 +C Intel core2 7 8 +C Intel corei ? 7.3 +C Intel atom ? ? +C VIA nano 9.25 11 C TODO C * Tune. It should be possible to reach 5 c/l for popcount and 6 c/l for diff --git a/mpn/x86_64/redc_1.asm b/mpn/x86_64/redc_1.asm index ceaadea01..08dd5baa3 100644 --- a/mpn/x86_64/redc_1.asm +++ b/mpn/x86_64/redc_1.asm @@ -22,11 +22,13 @@ include(`../config.m4') C cycles/limb C cycles/limb -C K8,K9: 2.5 -C K10: 2.5 -C P4: ? -C P6-15 (Core2): 5.3 -C P6-28 (Atom): ? +C AMD K8,K9 2.5 +C AMD K10 2.5 +C Intel P4 ? +C Intel core2 5.3 +C Intel corei ? +C Intel atom ? +C VIA nano ? C TODO C * Handle certain sizes, e.g., 1, 2, 3, 4, 8, with single-loop code. diff --git a/mpn/x86_64/rsh1aors_n.asm b/mpn/x86_64/rsh1aors_n.asm index 41e67e371..1841d1a75 100644 --- a/mpn/x86_64/rsh1aors_n.asm +++ b/mpn/x86_64/rsh1aors_n.asm @@ -21,10 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 2.14 (mpn_add_n + mpn_rshift need 4.125) -C K10: 2.14 (mpn_add_n + mpn_rshift need 4.125) -C P4: 12.75 -C P6-15: 3.75 +C AMD K8,K9 2.14 (mpn_add_n + mpn_rshift need 4.125) +C AMD K10 2.14 (mpn_add_n + mpn_rshift need 4.125) +C Intel P4 12.75 +C Intel core2 3.75 +C Intel corei ? +C Intel atom ? +C VIA nano 3.25 C TODO C * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm. 
diff --git a/mpn/x86_64/rshift.asm b/mpn/x86_64/rshift.asm index 3b1586828..0f822a4a0 100644 --- a/mpn/x86_64/rshift.asm +++ b/mpn/x86_64/rshift.asm @@ -21,11 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 2.375 -C K10: 2.375 -C P4: 8 -C P6-15 (Core2): 2.11 -C P6-28 (Atom): 5.75 +C AMD K8,K9 2.375 +C AMD K10 2.375 +C Intel P4 8 +C Intel core2 2.11 +C Intel corei ? +C Intel atom 5.75 +C VIA nano 3.5 C INPUT PARAMETERS diff --git a/mpn/x86_64/sublsh1_n.asm b/mpn/x86_64/sublsh1_n.asm index 6f67fae36..a2f48c007 100644 --- a/mpn/x86_64/sublsh1_n.asm +++ b/mpn/x86_64/sublsh1_n.asm @@ -21,13 +21,13 @@ include(`../config.m4') C cycles/limb -C K8,K9: 2.2 -C K10: 2.2 -C P4: 12.75 -C P6 core2: 3.45 -C P6 corei7: 3.45 -C P6 atom: ? - +C AMD K8,K9 2.2 +C AMD K10 2.2 +C Intel P4 12.75 +C Intel core2 3.45 +C Intel corei ? +C Intel atom ? +C VIA nano 3.25 C Sometimes speed degenerates, supposedly related to that some operand C alignments cause cache conflicts. |