* mpn/x86/pentium4/mmx/popham.asm: New file.

author: Kevin Ryde <user42@zip.com.au> 2001-11-02 00:52:56 +0100
committer: Kevin Ryde <user42@zip.com.au> 2001-11-02 00:52:56 +0100
commit: f7349743ef6d0dca00f2b039b67dc7bfd98fdcb5 (patch)
tree: f04d7bdf0d0d778adfb67dfac61e01e327e773e6 /mpn
parent: 1c253604cd43921af2635daca3b9d637b9adcdf8 (diff)
download: gmp-f7349743ef6d0dca00f2b039b67dc7bfd98fdcb5.tar.gz
1 files changed, 188 insertions, 0 deletions
diff --git a/mpn/x86/pentium4/mmx/popham.asm b/mpn/x86/pentium4/mmx/popham.asm
new file mode 100644
index 000000000..708391143
--- /dev/null
+++ b/mpn/x86/pentium4/mmx/popham.asm
@@ -0,0 +1,188 @@
+dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
+dnl  hamming distance.
+dnl 
+dnl  P4: popcount 8.5 cycles/limb
+dnl      hamdist  9.5 cycles/limb
+
+dnl  Copyright 2000, 2001 Free Software Foundation, Inc.
+dnl 
+dnl  This file is part of the GNU MP Library.
+dnl 
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 2.1 of the
+dnl  License, or (at your option) any later version.
+dnl 
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl 
+dnl  You should have received a copy of the GNU Lesser General Public
+dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
+dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl  Suite 330, Boston, MA 02111-1307, USA.
+
+include(`../config.m4')
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
+C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
+C and using them saves fiddling about with alignment testing on entry.
+C
+C For popcount there's 13 mmx instructions in the loop, so perhaps 6.5 c/l
+C might be possible, but 8.5 c/l relying on out-of-order execution is
+C already quite reasonable.
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
+')')')dnl  m4_error is reached only when neither OPERATION_ symbol is defined
+dnl  HAM(text) expands to text only when OPERATION_hamdist is defined.
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+dnl  POP(text) expands to text only when OPERATION_popcount is defined.
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+dnl  Frame and M4_function per operation.  NOTE(review): 4, 8, 12 are presumably byte offsets of the stack arguments at entry -- confirm against defframe.
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2,  8)
+defframe(PARAM_SRC,   4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE,  8)
+defframe(PARAM_SRC,   4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it lists the entry points this source can build -- confirm.
+dnl  Non-PIC builds keep the three 64-bit masks in read-only data; PIC builds construct them in registers at function entry instead.
+ifdef(`PIC',,`
+	dnl  non-PIC
+	RODATA
+	ALIGN(8)
+LF(M4_function,rodata_AAAAAAAAAAAAAAAA):	C loaded into mm7 at function entry
+	.long	0xAAAAAAAA
+	.long	0xAAAAAAAA
+LF(M4_function,rodata_3333333333333333):	C loaded into mm6 at function entry
+	.long	0x33333333
+	.long	0x33333333
+LF(M4_function,rodata_0F0F0F0F0F0F0F0F):	C loaded into mm5 at function entry
+	.long	0x0F0F0F0F
+	.long	0x0F0F0F0F
+')
+
+	TEXT
+	ALIGN(16)
+C Result is returned in eax.  NOTE(review): size 0 is not tested for; presumably callers always pass size >= 1 -- confirm.
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx	C ecx = size
+	movl	PARAM_SRC, %eax		C eax = src
+
+ifdef(`PIC',`
+	movl	$0xAAAAAAAA, %edx	C PIC: masks are built from immediates
+	movd	%edx, %mm7
+	punpckldq %mm7, %mm7		C mm7 = 0xAAAAAAAAAAAAAAAA
+
+	movl	$0x33333333, %edx
+	movd	%edx, %mm6
+	punpckldq %mm6, %mm6		C mm6 = 0x3333333333333333
+
+	movl	$0x0F0F0F0F, %edx
+	movd	%edx, %mm5
+	punpckldq %mm5, %mm5		C mm5 = 0x0F0F0F0F0F0F0F0F
+
+HAM(`	movl	PARAM_SRC2, %edx')	C [hamdist] edx = src2, after edx is finished as scratch
+
+',`
+	dnl non-PIC: masks are loaded from the read-only data labels
+HAM(`	movl	PARAM_SRC2, %edx')	C [hamdist] edx = src2
+	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
+	movq	L(rodata_3333333333333333), %mm6
+	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
+')
+
+	pxor	%mm4, %mm4		C zero
+	pxor	%mm0, %mm0		C total
+
+	subl	$1, %ecx		C ecx = size-1
+	ja	L(top)			C size >= 2: take limbs in pairs
+
+L(last):				C one limb left; ecx is 0 here, given size >= 1
+	movd	(%eax,%ecx,4), %mm1		C src high limb, zero extended to 64 bits
+HAM(`	movd	(%edx,%ecx,4), %mm2	C src2 high limb
+	pxor	%mm2, %mm1		C bits that differ
+')
+	jmp	L(loaded)		C share the counting code with the loop
+
+
+L(top):
+	C eax	src
+	C ebx	(not used)
+	C ecx	counter, limbs remaining minus 1; size-1 to 2 or 1, inclusive
+	C edx	[hamdist] src2
+	C
+	C mm0	total (low dword)
+	C mm1	(scratch)
+	C mm2	(scratch)
+	C mm3	[hamdist] (scratch)
+	C mm4	0x0000000000000000
+	C mm5	0x0F0F0F0F0F0F0F0F
+	C mm6	0x3333333333333333
+	C mm7	0xAAAAAAAAAAAAAAAA
+
+	movd	(%eax), %mm1		C first limb of the pair
+	movd	4(%eax), %mm2		C second limb of the pair
+	punpckldq %mm2, %mm1		C mm1 = both limbs as one qword
+	addl	$8, %eax
+
+HAM(`	movd	(%edx), %mm2
+	movd	4(%edx), %mm3
+	punpckldq %mm3, %mm2		C mm2 = matching pair from src2
+	pxor	%mm2, %mm1		C bits that differ
+	addl	$8, %edx
+')
+
+L(loaded):				C mm1 = 64 bits whose set bits are to be counted
+	movq	%mm7, %mm2
+	pand	%mm1, %mm2		C odd-numbered bits only
+	psrlq	$1, %mm2
+	psubd	%mm2, %mm1	C bit pairs: each 2-bit field = count of its two bits
+
+	movq	%mm6, %mm2
+	pand	%mm1, %mm2
+	psrlq	$2, %mm1
+	pand	%mm6, %mm1
+	paddd	%mm2, %mm1	C nibbles: each 4-bit field = count, 0 to 4
+
+	movq	%mm5, %mm2
+	pand	%mm1, %mm2
+	psrlq	$4, %mm1
+	pand	%mm5, %mm1
+	paddd	%mm2, %mm1	C bytes: each byte = count, 0 to 8
+
+	psadbw(	%mm4, %mm1)	C psadbw against zero mm4 sums the 8 byte counts
+	paddd	%mm1, %mm0	C to total
+
+	subl	$2, %ecx		C counter down by a pair; flags feed both jg and jz
+	jg	L(top)			C ecx >= 1: at least two limbs remain
+
+	C ecx is 0 or -1 for 1 or 0 further limbs (or -2 after the L(last) pass, also finished)
+	jz	L(last)			C ecx == 0: exactly one limb remains
+
+
+	movd	%mm0, %eax		C return value = low dword of total
+	emms				C empty the MMX state before returning
+	ret
+
+EPILOGUE()
author	Kevin Ryde <user42@zip.com.au>	2001-11-02 00:52:56 +0100
committer	Kevin Ryde <user42@zip.com.au>	2001-11-02 00:52:56 +0100
commit	f7349743ef6d0dca00f2b039b67dc7bfd98fdcb5 (patch)
tree	f04d7bdf0d0d778adfb67dfac61e01e327e773e6 /mpn
parent	1c253604cd43921af2635daca3b9d637b9adcdf8 (diff)
download	gmp-f7349743ef6d0dca00f2b039b67dc7bfd98fdcb5.tar.gz