path: root/mpn/powerpc64/mode64/invert_limb.asm
dnl  PowerPC-64 mpn_invert_limb -- Invert a normalized limb.

dnl  Copyright 2004, 2005, 2006, 2008, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb (approximate)
C POWER3/PPC630		80
C POWER4/PPC970		86
C POWER5		86
C POWER6	       170

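C  For reference: with B = 2^64 and a normalized divisor d (high bit set),
C  mpn_invert_limb returns floor((B^2 - 1)/d) - B, i.e. the quotient of
C  udiv_qrnnd(inv, dummy, ~d, ~CNST_LIMB(0), d).  A minimal C sketch of that
C  specification (not this file's Newton iteration; invert_limb_ref is a
C  hypothetical name, and GCC's unsigned __int128 is assumed):
C
C	#include <stdint.h>
C	uint64_t invert_limb_ref (uint64_t d)	/* d normalized, bit 63 set */
C	{
C	  /* n = (B - 1 - d) * B + (B - 1) = B^2 - 1 - d*B */
C	  unsigned __int128 n = ((unsigned __int128) ~d << 64) | UINT64_MAX;
C	  return (uint64_t) (n / d);	/* == floor((2^128 - 1)/d) - 2^64 */
C	}
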
ASM_START()
PROLOGUE(mpn_invert_limb)
	LEAL(	r12, approx_tab)
	srdi	r9, r3, 32
	rlwinm	r9, r9, 10, 23, 30	C (d >> 54) & 0x1fe
	srdi	r10, r3, 24		C d >> 24
	lis	r11, 0x1000
	rldicl	r8, r3, 0, 63		C d mod 2
	addi	r10, r10, 1		C d40
	sldi	r11, r11, 32		C 2^60
	srdi	r7, r3, 1		C d/2
	add	r7, r7, r8		C d63 = ceil(d/2)
	neg	r8, r8			C mask = -(d mod 2)
	lhzx	r0, r9, r12
	mullw	r9, r0, r0		C v0*v0
	sldi	r6, r0, 11		C v0 << 11
	addi	r0, r6, -1		C (v0 << 11) - 1
	mulld	r9, r9, r10		C v0*v0*d40
	srdi	r9, r9, 40		C v0*v0*d40 >> 40
	subf	r9, r9, r0		C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
	mulld	r0, r9, r10		C v1*d40
	sldi	r6, r9, 13		C v1 << 13
	subf	r0, r0, r11		C 2^60 - v1*d40
	mulld	r0, r0, r9		C v1 * (2^60 - v1*d40)
	srdi	r0, r0, 47		C v1 * (2^60 - v1*d40) >> 47
	add	r0, r0, r6		C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
	mulld	r11, r0, r7		C v2 * d63
	srdi	r10, r0, 1		C v2 >> 1
	sldi	r9, r0, 31		C v2 << 31
	and	r8, r10, r8		C (v2 >> 1) & mask
	subf	r8, r11, r8		C ((v2 >> 1) & mask) - v2 * d63
	mulhdu	r0, r8, r0		C p1 = (v2 * (((v2 >> 1) & mask) - v2 * d63)) >> 64
	srdi	r0, r0, 1		C p1 >> 1
	add	r0, r0, r9		C v3 = (v2 << 31) + (p1 >> 1)
	nop
	mulhdu	r9, r0, r3		C hi(v3 * d)
	mulld	r11, r0, r3		C lo(v3 * d)
	addc	r10, r11, r3		C lo(v3 * d) + d, set carry
	adde	r3, r9, r3		C hi(v3 * d) + d + carry
	subf	r3, r3, r0		C v4 = v3 - (hi(v3 * d + d) + d)
	blr
EPILOGUE()
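
C  For cross-checking, the computation above can be rendered in C roughly as
C  follows (a sketch, not part of GMP; invert_limb_c is a hypothetical name,
C  unsigned __int128 is assumed, and the approx_tab lookup is replaced by the
C  division that generates the table entries):
C
C	#include <stdint.h>
C	uint64_t invert_limb_c (uint64_t d)	/* d normalized, bit 63 set */
C	{
C	  uint64_t d40 = (d >> 24) + 1;
C	  uint64_t d63 = (d >> 1) + (d & 1);	/* ceil(d/2) */
C	  uint64_t mask = -(d & 1);
C	  uint64_t v0 = (((uint64_t) 1 << 19) - (3 << 8)) / (d >> 55);
C	  uint64_t v1 = (v0 << 11) - ((v0 * v0 * d40) >> 40) - 1;
C	  uint64_t v2 = (v1 << 13)
C		     + ((v1 * (((uint64_t) 1 << 60) - v1 * d40)) >> 47);
C	  uint64_t e  = ((v2 >> 1) & mask) - v2 * d63;
C	  uint64_t v3 = (v2 << 31)
C		     + (uint64_t) (((unsigned __int128) v2 * e) >> 65);
C	  unsigned __int128 t = (unsigned __int128) v3 * d + d;
C	  return v3 - ((uint64_t) (t >> 64) + d);	/* v4 */
C	}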

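C  The 256 halfword entries below appear to follow the usual initial
C  approximation formula approx_tab[i] = floor((2^19 - 3*2^8) / (2^8 + i))
C  for i = 0..255 (e.g. floor(523520/256) = 0x7fd, floor(523520/511) = 0x400).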
DEF_OBJECT(approx_tab)
        .short  0x7fd,0x7f5,0x7ed,0x7e5,0x7dd,0x7d5,0x7ce,0x7c6
        .short  0x7bf,0x7b7,0x7b0,0x7a8,0x7a1,0x79a,0x792,0x78b
        .short  0x784,0x77d,0x776,0x76f,0x768,0x761,0x75b,0x754
        .short  0x74d,0x747,0x740,0x739,0x733,0x72c,0x726,0x720
        .short  0x719,0x713,0x70d,0x707,0x700,0x6fa,0x6f4,0x6ee
        .short  0x6e8,0x6e2,0x6dc,0x6d6,0x6d1,0x6cb,0x6c5,0x6bf
        .short  0x6ba,0x6b4,0x6ae,0x6a9,0x6a3,0x69e,0x698,0x693
        .short  0x68d,0x688,0x683,0x67d,0x678,0x673,0x66e,0x669
        .short  0x664,0x65e,0x659,0x654,0x64f,0x64a,0x645,0x640
        .short  0x63c,0x637,0x632,0x62d,0x628,0x624,0x61f,0x61a
        .short  0x616,0x611,0x60c,0x608,0x603,0x5ff,0x5fa,0x5f6
        .short  0x5f1,0x5ed,0x5e9,0x5e4,0x5e0,0x5dc,0x5d7,0x5d3
        .short  0x5cf,0x5cb,0x5c6,0x5c2,0x5be,0x5ba,0x5b6,0x5b2
        .short  0x5ae,0x5aa,0x5a6,0x5a2,0x59e,0x59a,0x596,0x592
        .short  0x58e,0x58a,0x586,0x583,0x57f,0x57b,0x577,0x574
        .short  0x570,0x56c,0x568,0x565,0x561,0x55e,0x55a,0x556
        .short  0x553,0x54f,0x54c,0x548,0x545,0x541,0x53e,0x53a
        .short  0x537,0x534,0x530,0x52d,0x52a,0x526,0x523,0x520
        .short  0x51c,0x519,0x516,0x513,0x50f,0x50c,0x509,0x506
        .short  0x503,0x500,0x4fc,0x4f9,0x4f6,0x4f3,0x4f0,0x4ed
        .short  0x4ea,0x4e7,0x4e4,0x4e1,0x4de,0x4db,0x4d8,0x4d5
        .short  0x4d2,0x4cf,0x4cc,0x4ca,0x4c7,0x4c4,0x4c1,0x4be
        .short  0x4bb,0x4b9,0x4b6,0x4b3,0x4b0,0x4ad,0x4ab,0x4a8
        .short  0x4a5,0x4a3,0x4a0,0x49d,0x49b,0x498,0x495,0x493
        .short  0x490,0x48d,0x48b,0x488,0x486,0x483,0x481,0x47e
        .short  0x47c,0x479,0x477,0x474,0x472,0x46f,0x46d,0x46a
        .short  0x468,0x465,0x463,0x461,0x45e,0x45c,0x459,0x457
        .short  0x455,0x452,0x450,0x44e,0x44b,0x449,0x447,0x444
        .short  0x442,0x440,0x43e,0x43b,0x439,0x437,0x435,0x432
        .short  0x430,0x42e,0x42c,0x42a,0x428,0x425,0x423,0x421
        .short  0x41f,0x41d,0x41b,0x419,0x417,0x414,0x412,0x410
        .short  0x40e,0x40c,0x40a,0x408,0x406,0x404,0x402,0x400
END_OBJECT(approx_tab)
ASM_END()