dnl  PowerPC-64 mpn_addlshC_n and mpn_sublshC_n, where C is a small constant.

dnl  Copyright 2003, 2005, 2009, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

C		   cycles/limb
C POWER3/PPC630		 1.83	(1.5 c/l should be possible)
C POWER4/PPC970		 3	(2.0 c/l should be possible)
C POWER5		 3
C POWER6	      3.5-47

C STATUS
C  * Try combining upx+up, and vpx+vp.
C  * The worst case 47 c/l for POWER6 happens if the 3rd operand for ldx is
C    greater than the 2nd operand.  Yes, this addition is non-commutative wrt
C    performance.

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`vp', `r5')
define(`n',  `r6')

define(`rpx', `r6')
define(`upx', `r7')
define(`vpx', `r12')

define(`s0', `r0')  define(`s1', `r9')
define(`u0', `r8')
define(`v0', `r10') define(`v1', `r11')


ASM_START()
PROLOGUE(func)
	cmpldi	cr0, n, 13
	bgt	L(big)

	mtctr	n		C copy n in ctr
	INITCY(	r0)		C clear cy

	ld	v0, 0(vp)	C load v limb
	ld	u0, 0(up)	C load u limb
	addi	up, up, -8	C update up
	addi	rp, rp, -8	C update rp
	sldi	s1, v0, LSH
	bdz	L(ex1)		C If done, skip loop

	ALIGN(16)
L(lo0):	ld	v1, 8(vp)	C load v limb
	ADDSUBE	s1, s1, u0	C add limbs with cy, set cy
	ldu	u0, 16(up)	C load u limb and update up
	srdi	s0, v0, RSH	C shift down previous v limb
	std	s1, 8(rp)	C store result limb
	rldimi	s0, v1, LSH, 0	C left shift v limb and merge with prev v limb
	bdz	L(ex0)		C decrement ctr and exit if done
	ldu	v0, 16(vp)	C load v limb and update vp
	ADDSUBE	s0, s0, u0	C add limbs with cy, set cy
	ld	u0, 8(up)	C load u limb
	srdi	s1, v1, RSH	C shift down previous v limb
	stdu	s0, 16(rp)	C store result limb and update rp
	rldimi	s1, v0, LSH, 0	C left shift v limb and merge with prev v limb
	bdnz	L(lo0)		C decrement ctr and loop back

L(ex1):	ADDSUBE	r7, s1, u0
	std	r7, 8(rp)	C store last result limb
	srdi	r0, v0, RSH
	RETVAL(	r0)
	blr
L(ex0):	ADDSUBE	r7, s0, u0
	std	r7, 16(rp)	C store last result limb
	srdi	r0, v1, RSH
	RETVAL(	r0)
	blr


L(big):	rldicl.	r0, n, 0,63	C r0 = n & 1, set cr0
	addi	r6, n, -1	C ...for ctr
	srdi	r6, r6, 1	C ...for ctr
	mtctr	r6		C copy count into ctr
	beq	cr0, L(b0)

L(b1):	ld	v1, 0(vp)
	ld	u0, 0(up)
	sldi	s1, v1, LSH
	srdi	s0, v1, RSH
	ld	v0, 8(vp)
	ADDSUBC	s1, s1, u0	C add limbs without cy, set cy
	addi	rpx, rp, -16
	addi	rp, rp, -8
	sub	upx, up, rp
	sub	vpx, vp, rp
	sub	up, up, rpx
	sub	vp, vp, rpx
	addi	up, up, 8
	addi	upx, upx, 16
	addi	vp, vp, 16
	addi	vpx, vpx, 24
	b	L(mid)

L(b0):	ld	v0, 0(vp)
	ld	u0, 0(up)
	sldi	s0, v0, LSH
	srdi	s1, v0, RSH
	ld	v1, 8(vp)
	ADDSUBC	s0, s0, u0	C add limbs without cy, set cy
	addi	rpx, rp, -8
	addi	rp, rp, -16
	sub	upx, up, rpx
	sub	vpx, vp, rpx
	sub	up, up, rp
	sub	vp, vp, rp
	addi	up, up, 8
	addi	upx, upx, 16
	addi	vp, vp, 16
	addi	vpx, vpx, 24

	ALIGN(32)
L(top):	ldx	u0, rp, up
	ldx	v0, rp, vp
	rldimi	s1, v1, LSH, 0
	stdu	s0, 16(rp)
	srdi	s0, v1, RSH
	ADDSUBE	s1, s1, u0	C add limbs with cy, set cy
L(mid):	ldx	u0, rpx, upx
	ldx	v1, rpx, vpx
	rldimi	s0, v0, LSH, 0
	stdu	s1, 16(rpx)
	srdi	s1, v0, RSH
	ADDSUBE	s0, s0, u0	C add limbs with cy, set cy
	bdnz	L(top)		C decrement CTR and loop back

	ldx	u0, rp, up
	rldimi	s1, v1, LSH, 0
	std	s0, 16(rp)
	srdi	s0, v1, RSH
	ADDSUBE	s1, s1, u0	C add limbs with cy, set cy
	std	s1, 24(rp)

	RETVAL(	r0)
	blr
EPILOGUE()