1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
|
# PowerPC-64 mpn_addsub_n -- Simultaneous add and sub.
# Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
# License for more details.
# You should have received a copy of the GNU Lesser General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.
# INPUT PARAMETERS (as the code below actually uses them)
# r1p	r3	destination of the sums
# r2p	r4	destination of the differences
# s1p	r5
# s2p	r6
# n	r7
include(`../config.m4')
# Carry/borrow chain switch.  The single argument names a scratch register
# whose bit 0 holds the CA value of the chain that is currently parked.
#   sldi  moves that parked bit up to bit 63 and drops every other bit
#   adde  adds the register to itself plus CA, so the bit 63 value leaves
#         through CA while the previous CA lands in bit 0
# The net effect is an exchange of CA with bit 0 of the register, which is
# why both directions expand to exactly the same two instructions.
define(SAVE_BORROW_RESTORE_CARRY,
`	sldi	$1,$1,63
	adde	$1,$1,$1')
define(SAVE_CARRY_RESTORE_BORROW,
`SAVE_BORROW_RESTORE_CARRY(`$1')')
# 19991117
# This is just crafted for testing some ideas, and verifying that we can make
# it run fast. It runs at 2.55 cycles/limb on the 630, which is very good.
# We should play a little with the schedule. No time has been spent on that.
# To finish this, the loop warm-up and cool-down code needs to be written,
# and the results need to be tested. Also, the proper calling sequence should
# be used.
# Arguments, in order: r1p r2p s1p s2p n (held in r3 through r7)
# Uses r0, r3-r12 and r14-r17, plus CTR
ASM_START()
PROLOGUE(mpn_addsub_n)

# Simultaneous add and subtract of two limb vectors, four limbs per loop pass.
#
# Register roles as used by the code below:
#   r3   first result pointer, receives the adde results (sums)
#   r4   second result pointer, receives the subfe results (differences)
#   r5   first source pointer
#   r6   second source pointer
#   r7   limb count, divided by 4 to give the loop count in CTR
#   r0   bit 0 parks the CA value of whichever chain is not currently live
#   r12  scratch for each result limb
#   r8-r11   source limbs of pair A
#   r14-r17  source limbs of pair B
#
# NOTE(review): this routine is still unfinished.  Known gaps, none of which
# are addressed here:
#   - no warm-up: the first pass consumes r8-r11 and r14-r17 before any load
#     has filled them
#   - no cool-down: the limbs loaded by the last pass are never combined or
#     stored
#   - the n mod 4 leftover limbs are not processed, and a count below 4 gives
#     CTR = 0, for which bdnz wraps around instead of falling through
#   - nothing is returned: at blr r3 still holds the advanced result pointer
#   - subfe RT,RA,RB yields RB minus RA, so each difference limb is the
#     r6-side limb minus the r5-side limb; confirm that direction is intended

# r14-r17 are overwritten by the loop and are nonvolatile in the 64-bit
# PowerPC ELF ABI, so the caller values are kept below the stack pointer.
# r1 itself is not moved.
	std	r14,-64(r1)
	std	r15,-56(r1)
	std	r16,-48(r1)
	std	r17,-40(r1)

	srdi	r7,r7,2		# r7 = n / 4, four limbs per pass
	mtctr	r7		# loop count into CTR
	li	r0,1		# parked subtract chain starts at CA = 1, meaning no borrow
	addic	r0,r0,0		# CA = 0 for the add chain, r0 keeps its value
	addi	r3,r3,-8	# bias first result pointer, stores use offsets 8..32
	addi	r4,r4,-8	# bias second result pointer the same way

.Loop:
# Pair A, add chain live.
	adde	r12,r8,r9
	std	r12,8(r3)
	adde	r12,r10,r11
	std	r12,16(r3)
# Park the add CA in r0 and bring the subtract CA back.
	SAVE_CARRY_RESTORE_BORROW(r0)
# Pair A, subtract chain live.  Each register pair is reloaded only after
# both chains have consumed it.
	subfe	r12,r8,r9
	std	r12,8(r4)
	ld	r8,8(r5)	# s1 limb 1
	ld	r9,8(r6)	# s2 limb 1
	subfe	r12,r10,r11
	std	r12,16(r4)
	ld	r10,16(r5)	# s1 limb 2
	ld	r11,16(r6)	# s2 limb 2
# pair -------------------------
# Pair B, subtract chain still live.  stdu advances r4 by 32 bytes.
	subfe	r12,r14,r15
	std	r12,24(r4)
	subfe	r12,r16,r17
	stdu	r12,32(r4)
# Park the subtract CA in r0 and bring the add CA back.
	SAVE_BORROW_RESTORE_CARRY(r0)
# Pair B, add chain live.  stdu advances r3, and ldu advances r5 and r6,
# by 32 bytes each.
	adde	r12,r14,r15
	std	r12,24(r3)
	ld	r14,24(r5)	# s1 limb 3
	ld	r15,24(r6)	# s2 limb 3
	adde	r12,r16,r17
	stdu	r12,32(r3)
	ldu	r16,32(r5)	# s1 limb 4
	ldu	r17,32(r6)	# s2 limb 4
	bdnz	.Loop

# Give the caller back its r14-r17.
	ld	r14,-64(r1)
	ld	r15,-56(r1)
	ld	r16,-48(r1)
	ld	r17,-40(r1)
	blr
EPILOGUE(mpn_addsub_n)
|