Add mpn_tabselect assembly support for powerpc64, x86, x86_64, ia64.

author: Torbjorn Granlund <tege@gmplib.org> 2011-11-15 00:53:06 +0100
committer: Torbjorn Granlund <tege@gmplib.org> 2011-11-15 00:53:06 +0100
commit: aebd2151218bded6e4278834b9f082808eef6590 (patch)
tree: 4a7f12237027bda254f48a12a5c674cd06c55417
parent: e1d8e2b8173bbd8e9b034722206979eef782df2c (diff)
download: gmp-aebd2151218bded6e4278834b9f082808eef6590.tar.gz
5 files changed, 452 insertions, 1 deletions
diff --git a/ChangeLog b/ChangeLog
index 0491b1574..b14d2a8da 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,10 +1,15 @@
 2011-11-15  Torbjorn Granlund  <tege@gmplib.org>
 
+	* mpn/powerpc64/tabselect.asm: New file.
+	* mpn/x86_64/tabselect.asm: New file.
+	* mpn/x86/tabselect.asm: New file.
+	* mpn/ia64/tabselect.asm: New file.
+
 	* mpn/asm-defs.m4 (define_mpn): Add tabselect.
 
 	* configure.in (gmp_mpn_functions): Add tabselect.
 	(HAVE_NATIVE): Add entries for addncd_n, subcnd_n, tabselect.
-	
+
 	* mpn/generic/powm_sec.c: Remove mpn_tabselect implementation.
 	* mpn/generic/tabselect.c: New file with removed code.
 
diff --git a/mpn/ia64/tabselect.asm b/mpn/ia64/tabselect.asm
new file mode 100644
index 000000000..0ae3fdcfe
--- /dev/null
+++ b/mpn/ia64/tabselect.asm
@@ -0,0 +1,139 @@
+dnl  IA-64 mpn_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:       ?
+C Itanium 2:     5  (estimated)
+
+C NOTES
+C  * Using software pipelining could trivially yield 3 c/l even without
+C    unrolling.  (This code was modelled after the powerpc64 code, for
+C    simplicity.)
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `r32')		C destination, n limbs
+define(`tp',     `r33')		C table, nents entries of n limbs each
+define(`n',      `r34')		C limbs per entry, later scaled to bytes
+define(`nents',  `r35')		C entries left to visit, counts down
+define(`which',  `r36')		C index of the entry to extract
+
+define(`mask',   `r8')		C all ones at the selected entry, else zero
+
+define(`rp1',     `r32')		C same register as rp
+define(`tp1',     `r33')		C same register as tp
+define(`rp2',     `r14')		C runs one limb ahead of rp1
+define(`tp2',     `r15')		C runs one limb ahead of tp1
+C Every entry is read and merged into rp under mask; only entry which is kept.
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_tabselect)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+.mmi;	addp4	rp = 0, rp		C			M I
+	addp4	tp = 0, tp		C			M I
+	zxt4	n = n			C			I
+.mii;	nop	0
+	zxt4	nents = nents		C			I
+	zxt4	which = which		C			I
+	;;
+')
+C ar.lc and r4-r7 are preserved registers: ar.lc is kept in r2, the count in r3.
+.mmi;	add	rp2 = 8, rp1
+	add	tp2 = 8, tp1
+	mov.i	r2 = ar.lc		C save ar.lc, restored before return
+.mmi;	add	r3 = -2, n
+	cmp.eq	p10, p0 = 1, n		C p10 set iff n = 1, paired loop is skipped
+	tbit.z	p8, p0 = n, 0		C p8 set iff n is even
+	;;
+.mii;	sub	which = nents, which	C equals nents when the selected entry is reached
+	shr.u	r3 = r3, 1		C inner loop count
+	shl	n = n, 3		C n in bytes
+	;;
+
+L(outer):
+.mmi;	cmp.eq	p6, p7 = which, nents	C are we at the selected table entry?
+	nop	0
+	mov	ar.lc = r3		C			I0
+	;;
+.mmb;
+  (p6)	mov	mask = -1
+  (p7)	mov	mask = 0
+  (p8)	br.dptk	L(top)			C branch to loop entry if n even
+	;;
+C n is odd: merge a single limb first, leaving a whole number of limb pairs.
+.mmi;	ld8	r16 = [tp1], 8
+	add	tp2 = 8, tp2
+	nop	0
+	;;
+.mmi;	ld8	r18 = [rp1]
+	and	r16 = r16, mask		C table limb, kept only at the selected entry
+	nop	0
+	;;
+.mmi;	andcm	r18 = r18, mask		C old result limb, kept at all other entries
+	;;
+	or	r16 = r16, r18
+	nop	0
+	;;
+.mmb;	st8	[rp1] = r16, 8
+	add	rp2 = 8, rp2
+  (p10)	br.dpnt	L(end)			C n = 1, nothing left for the paired loop
+
+	ALIGN(32)
+L(top):
+.mmi;	ld8	r16 = [tp1], 16
+	ld8	r17 = [tp2], 16
+	nop	0
+	;;
+.mmi;	ld8	r18 = [rp1]
+	and	r16 = r16, mask
+	nop	0
+.mmi;	ld8	r19 = [rp2]
+	and	r17 = r17, mask
+	nop	0
+	;;
+.mmi;	andcm	r18 = r18, mask
+	andcm	r19 = r19, mask
+	nop	0
+	;;
+.mmi;	or	r16 = r16, r18
+	or	r17 = r17, r19
+	nop	0
+	;;
+.mmb;	st8	[rp1] = r16, 16
+	st8	[rp2] = r17, 16
+	br.cloop.dptk	L(top)
+	;;
+L(end):
+.mmi;	sub	rp1 = rp1, n		C move rp back to beginning
+	sub	rp2 = rp2, n		C move rp back to beginning
+	cmp.ne	p9, p0 = 1, nents	C p9 set iff more entries remain
+.mmb;	add	nents = -1, nents
+	nop	0
+  (p9)	br.dptk	L(outer)
+	;;
+
+.mib;	nop	0
+	mov.i	ar.lc = r2		C restore the preserved loop counter
+	br.ret.sptk.many b0
+EPILOGUE()
diff --git a/mpn/powerpc64/tabselect.asm b/mpn/powerpc64/tabselect.asm
new file mode 100644
index 000000000..0ac2e9ba0
--- /dev/null
+++ b/mpn/powerpc64/tabselect.asm
@@ -0,0 +1,95 @@
+dnl  PowerPC-64 mpn_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          ?
+C POWER5                 ?
+C POWER6                 ?
+C POWER7                 ?
+
+C NOTES
+C  * This has not been tuned for any specific processor.  Its speed should not
+C    be too bad, though.
+C  * Using VMX could result in significant speedup for certain CPUs.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `r3')		C destination, n limbs
+define(`tp',     `r4')		C table, nents entries of n limbs each
+define(`n',      `r5')		C limbs per entry, later scaled to bytes
+define(`nents',  `r6')		C entries left to visit, counts down
+define(`which',  `r7')		C index of the entry to extract, later biased by -nents
+
+define(`mask',   `r8')		C all ones at the selected entry, else zero
+C Every entry is read and merged into rp under mask; only entry which is kept.
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_tabselect)
+	addi	r0, n, 1
+	srdi	r0, r0, 1		C inner loop count, (n+1)/2
+	andi.	r9, n, 1		C cr0 eq iff n even, tested in every outer pass
+	subf	which, nents, which	C bias so which + nents is 0 at the selected entry
+	sldi	n, n, 3			C n in bytes
+
+L(outer):
+	mtctr	r0			C put inner loop count in ctr
+
+	add	r9, which, nents	C are we at the selected table entry?
+	addic	r9, r9, -1		C set CF iff not selected entry
+	subfe	mask, r0, r0		C mask = CF - 1: -1 if selected, 0 otherwise
+
+	beq	cr0, L(top)		C branch to loop entry if n even
+
+	ld	r9, 0(tp)		C n odd: merge a single limb first
+	and	r9, r9, mask
+	ld	r11, 0(rp)
+	andc	r11, r11, mask
+	or	r9, r9, r11
+	std	r9, 0(rp)
+	addi	tp, tp, 8
+	addi	rp, rp, 8
+	bdz	L(end)			C count exhausted already, which means n = 1
+
+	ALIGN(16)
+L(top):	ld	r9, 0(tp)		C two table limbs per iteration
+	ld	r10, 8(tp)
+	and	r9, r9, mask		C kept only at the selected entry
+	and	r10, r10, mask
+	ld	r11, 0(rp)		C two current result limbs
+	ld	r12, 8(rp)
+	andc	r11, r11, mask		C kept at all other entries
+	andc	r12, r12, mask
+	or	r9, r9, r11
+	or	r10, r10, r12
+	std	r9, 0(rp)
+	std	r10, 8(rp)
+	addi	tp, tp, 16		C tp keeps advancing into the next entry
+	addi	rp, rp, 16
+	bdnz	L(top)
+
+L(end):	subf	rp, n, rp		C move rp back to beginning
+	addi	nents, nents, -1
+	cmpdi	cr6, nents, 0		C cr6 is used so that cr0 keeps the parity of n
+	bne	cr6, L(outer)		C more entries to visit
+
+	blr
+EPILOGUE()
diff --git a/mpn/x86/tabselect.asm b/mpn/x86/tabselect.asm
new file mode 100644
index 000000000..ab646dac3
--- /dev/null
+++ b/mpn/x86/tabselect.asm
@@ -0,0 +1,104 @@
+dnl  x86 mpn_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9  (Banias)		 ?
+C P6 model 13 (Dothan)		 ?
+C P4 model 0  (Willamette)	 ?
+C P4 model 1  (?)		 ?
+C P4 model 2  (Northwood)	 ?
+C P4 model 3  (Prescott)	 ?
+C P4 model 4  (Nocona)		 ?
+C Intel Atom			 ?
+C AMD K6			 ?
+C AMD K7			 ?
+C AMD K8			 ?
+C AMD K10			 ?
+
+C NOTES
+C  * This has not been tuned for any specific processor.  Its speed should not
+C    be too bad, though.
+C  * Using SSE2 could result in many-fold speedup.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `%edi')		C destination, n limbs
+define(`tp',     `%esi')		C table, nents entries of n limbs each
+define(`n',      `%ebx')		C limbs per entry
+define(`nents',  `%ecx')		C entries left to visit, counts down
+define(`which',  `36(%esp)')	C fifth argument, used in place on the stack
+
+define(`i',      `%ebp')		C negative limb index, runs up to 0
+define(`maskp',  `20(%esp)')	C reuses the stack slot of the rp argument
+define(`maskn',  `32(%esp)')	C reuses the stack slot of the nents argument
+C Every entry is read and merged into rp under the masks; only entry which is kept.
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_tabselect)
+	push	%edi			C save the four registers used below
+	push	%esi
+	push	%ebx
+	push	%ebp
+	mov	20(%esp), rp		C arguments start at 20(%esp) after the pushes
+	mov	24(%esp), tp
+	mov	28(%esp), n
+	mov	32(%esp), nents
+
+	lea	(rp,n,4), rp		C point past the end, limbs are indexed with negative i
+	lea	(tp,n,4), tp		C likewise, past the end of the first entry
+	sub	nents, which		C bias so which + nents is 0 at the selected entry
+L(outer):
+	mov	which, %eax
+	add	nents, %eax		C zero iff this is the selected entry
+	neg	%eax			C set CF iff 'which' != k
+	sbb	%eax, %eax		C 0 if selected, -1 otherwise
+	mov	%eax, maskn		C maskn keeps the old result limb
+	not	%eax
+	mov	%eax, maskp		C maskp lets the table limb through
+
+	mov	n, i
+	neg	i			C i runs from -n up to 0
+
+	ALIGN(16)
+L(top):	mov	(tp,i,4), %eax		C table limb
+	and	maskp, %eax
+	mov	(rp,i,4), %edx		C current result limb
+	and	maskn, %edx
+	or	%edx, %eax		C exactly one of the two operands was masked to zero
+	mov	%eax, (rp,i,4)
+	inc	i
+	js	L(top)			C loop while i is negative
+
+L(end):	mov	n, %eax
+	lea	(tp,%eax,4), tp		C step to the end of the next entry
+	dec	nents
+	jne	L(outer)		C more entries to visit
+
+L(outer_end):
+	pop	%ebp			C restore in reverse order of the pushes
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
diff --git a/mpn/x86_64/tabselect.asm b/mpn/x86_64/tabselect.asm
new file mode 100644
index 000000000..f7de6a85b
--- /dev/null
+++ b/mpn/x86_64/tabselect.asm
@@ -0,0 +1,108 @@
+dnl  AMD64 mpn_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C NOTES
+C  * This has not been tuned for any specific processor.  Its speed should not
+C    be too bad, though.
+C  * Using SSE2/AVX2 could result in many-fold speedup.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `%rdi')		C destination, n limbs
+define(`tp',     `%rsi')		C table, nents entries of n limbs each
+define(`n',      `%rdx')		C limbs per entry
+define(`nents',  `%rcx')		C entries left to visit, counts down
+define(`which',  `%r8')		C index of the entry to extract, later biased by -nents
+
+define(`i',      `%rbp')		C negative limb index, runs up to 0
+define(`maskp',  `%r11')		C all ones at the selected entry, else zero
+define(`maskn',  `%r12')		C complement of maskp
+C Every entry is read and merged into rp under the masks; only entry which is kept.
+C rax rbx  rcx  rdx rdi rsi rbp (rsp)  r8   r9 r10  r11   r12  r13 r14 r15
+C tmp tmp nents  n  rp  tp   i        which tmp tmp maskp maskn
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_tabselect)
+	push	%rbx			C callee-saved registers used below
+	push	%rbp
+	push	%r12
+
+	lea	(rp,n,8), rp		C point past the end, limbs are indexed with negative i
+	lea	(tp,n,8), tp		C likewise, past the end of the first entry
+	sub	nents, which		C bias so which + nents is 0 at the selected entry
+L(outer):
+	lea	(which,nents), %rax	C zero iff this is the selected entry
+	neg	%rax			C set CF iff 'which' != k
+	sbb	maskn, maskn		C 0 if selected, -1 otherwise
+	mov	maskn, maskp
+	not	maskp			C -1 if selected, 0 otherwise
+
+	mov	n, i
+	neg	i			C i runs from -n up to 0
+	test	$1, R32(n)
+	je	L(top)			C n even: go straight to the paired loop
+	mov	(tp,i,8), %rax		C n odd: merge a single limb first
+	and	maskp, %rax
+	mov	(rp,i,8), %r9
+	and	maskn, %r9
+	or	%r9, %rax
+	mov	%rax, (rp,i,8)
+	add	$1, i
+	jns	L(end)			C i reached 0, which means n = 1
+
+	ALIGN(16)
+L(top):	mov	(tp,i,8), %rax		C two table limbs per iteration
+	mov	8(tp,i,8), %rbx
+	and	maskp, %rax		C kept only at the selected entry
+	and	maskp, %rbx
+	mov	(rp,i,8), %r9		C two current result limbs
+	mov	8(rp,i,8), %r10
+	and	maskn, %r9		C kept at all other entries
+	and	maskn, %r10
+	or	%r9, %rax
+	or	%r10, %rbx
+	mov	%rax, (rp,i,8)
+	mov	%rbx, 8(rp,i,8)
+	add	$2, i
+	js	L(top)			C loop while i is negative
+
+L(end):	lea	(tp,n,8), tp		C step to the end of the next entry
+	dec	nents
+	jne	L(outer)		C more entries to visit
+
+L(outer_end):
+	pop	%r12			C restore in reverse order of the pushes
+	pop	%rbp
+	pop	%rbx
+	ret
+EPILOGUE()
author	Torbjorn Granlund <tege@gmplib.org>	2011-11-15 00:53:06 +0100
committer	Torbjorn Granlund <tege@gmplib.org>	2011-11-15 00:53:06 +0100
commit	aebd2151218bded6e4278834b9f082808eef6590 (patch)
tree	4a7f12237027bda254f48a12a5c674cd06c55417
parent	e1d8e2b8173bbd8e9b034722206979eef782df2c (diff)
download	gmp-aebd2151218bded6e4278834b9f082808eef6590.tar.gz