1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
|
/*
* (c) Copyright 1986 HEWLETT-PACKARD COMPANY
*
* To anyone who acknowledges that this file is provided "AS IS"
* without any express or implied warranty:
* permission to use, copy, modify, and distribute this file
* for any purpose is hereby granted without fee, provided that
* the above copyright notice and this notice appears in all
* copies, and that the name of Hewlett-Packard Company not be
* used in advertising or publicity pertaining to distribution
* of the software without specific, written prior permission.
* Hewlett-Packard Company makes no representations about the
* suitability of this software for any purpose.
*/
/* HPUX_ID: @(#) $Revision$ */
/*
* memcpy(s1, s2, n)
*
* Copy n characters from s2 to s1; returns s1.
*/
/*
 * Register aliases used by memcpy below.
 *
 *   d_addr, s_addr, count  - the three incoming arguments (destination,
 *                            source, byte count), in arg0..arg2.
 *   tmp1..tmp4, tmp6       - scratch registers r19..r22 and r31.
 *   tmp5                   - a fifth scratch register; it reuses arg3,
 *                            which memcpy(s1, s2, n) does not receive an
 *                            argument in.
 *
 * NOTE(review): arg0..arg3 are not defined in this file; presumably
 * they come from DEFS.h or the assembler's register names - confirm.
 */
#define d_addr arg0
#define s_addr arg1
#define count arg2
#define tmp5 arg3
#define tmp1 r19
#define tmp2 r20
#define tmp3 r21
#define tmp4 r22
#define tmp6 r31
#include "DEFS.h"
/*
 * memcpy: copy count bytes from s_addr to d_addr; d_addr is returned in ret0.
 *
 * Overview of the paths below:
 *   byteloop    - count <= 5: simple byte-at-a-time copy.
 *   chunks..    - source and destination share the same offset within a
 *                 word: copy whole words, 16 bytes per iteration, then 4.
 *   not_aligned - offsets differ: read aligned source words and merge
 *                 adjacent pairs with vshd (shift amount held in cr11).
 * In the word paths the partial first word ("front porch") is written
 * with stbys,b and the partial last word ("back porch") with stbys,e.
 *
 * Many instructions sit in branch delay slots, and several branches use
 * the ,n completer to nullify the following instruction; the statement
 * order is therefore significant and must not be rearranged.
 */
ENTRY(memcpy)
comib,>= 5,count,byteloop /* Branch when 5 >= count, i.e. count <= 5: don't get fancy. */
movb,=,n d_addr,ret0,done /* ret0 = d_addr (the return value). DELAY SLOT of the comib above, */
/* so it executes on both paths. If d_addr is null (zero), branch to done. */
extru s_addr,31,2,tmp1 /* tmp1 = low two bits of the source address (byte offset in word). */
extru d_addr,31,2,tmp2 /* tmp2 = low two bits of the destination address. */
add count,tmp2,count /* count += destination byte offset, to account for the partial first word. */
comb,<> tmp2,tmp1,not_aligned /* Offsets differ -> source needs shifting; go to not_aligned. */
dep 0,31,2,s_addr /* Clear the low two bits: s_addr = word address of the source. DELAY SLOT (runs on both paths). */
/* Aligned case: source and destination have the same offset within a word. */
/* Move 16 bytes per iteration while at least 16 bytes remain; */
/* otherwise fall to the 4-bytes-at-a-time word move. */
addibt,<,n -16,count,chekchunk /* count -= 16; if the result is negative, skip the 16-byte loop. */
/* actually we can legally move 13 or more bytes on the first loop. */
/* All four loads are issued before the stores so as to prevent processor interlock. */
chunks:
ldwm 16(0,s_addr),tmp1 /* tmp1 = 1st word; s_addr += 16 */
ldw -12(0,s_addr),tmp2 /* tmp2 = 2nd word */
ldw -8(0,s_addr),tmp3 /* tmp3 = 3rd word */
ldw -4(0,s_addr),tmp4 /* tmp4 = 4th word */
/* Now store the results ! */
stbys,b,m tmp1,4(0,d_addr) /* Store 1st word, d_addr advances by 4. The "b" (begin) form takes care of the front porch: */
/* NOTE(review): per the PA-RISC STBYS definition it should write only the bytes from d_addr's */
/* byte offset to the end of the word and leave d_addr word-aligned - confirm in the manual. */
stwm tmp2,4(0,d_addr) /* Store 2nd word, d_addr += 4. */
stwm tmp3,4(0,d_addr) /* Store 3rd word, d_addr += 4. */
addibf,< -16,count,chunks /* count -= 16; loop again unless the result went negative. */
stwm tmp4,4(0,d_addr) /* Store 4th word, d_addr += 4. DELAY SLOT (runs every iteration). */
chekchunk:
addibt,<,n 12,count,back_porch /* count += 12: count is now biased by -4, so a negative result */
/* means fewer than 4 bytes are left -> back porch. */
subchunk:
ldws,ma 4(s_addr),tmp1 /* tmp1 = *s_addr; s_addr += 4 */
addibf,< -4,count,subchunk /* count -= 4; loop again unless the result went negative. */
stbys,b,m tmp1,4(d_addr) /* Store the word, d_addr advances by 4. DELAY SLOT. The "b" form is used here too */
/* because this may be the first store when the 16-byte loop was skipped. */
back_porch:
addibt,=,n 4,count,done /* count += 4 removes the bias; zero bytes left -> done. */
ldws 0(s_addr),tmp1 /* Load the word holding the trailing bytes (the back porch). */
add d_addr,count,d_addr/* d_addr += count: address one past the last byte to be written. */
bv 0(r2) /* Return through r2. */
stbys,e tmp1,0(d_addr) /* DELAY SLOT of the return: "e" (end) form stores the trailing bytes. */
/* NOTE(review): per the PA-RISC STBYS definition this should write the bytes of the word that */
/* precede d_addr's byte offset - confirm in the manual. */
/* Begin non-aligned code: source and destination offsets within a word differ. */
not_aligned:
sub,>= tmp2,tmp1,tmp3 /* tmp3 = dest offset - source offset (byte shift); nullify the next load if tmp2 >= tmp1. */
ldwm 4(0,s_addr),tmp1 /* Executed only when source offset > dest offset: tmp1 = first source word; s_addr += 4. */
zdep tmp3,28,29,tmp4 /* tmp4 = tmp3 shifted left by 3: convert the byte shift into a bit shift. */
mtctl tmp4,11 /* Load the bit shift into cr11, the shift amount register used by vshd. */
addibt,<,n -16,count,chkchnk2 /* count -= 16; if the result is negative, skip the 16-byte loop. */
chunk2:
ldwm 16(0,s_addr),tmp2 /* tmp2 = next source word; s_addr += 16 */
ldw -12(s_addr),tmp3 /* tmp3 = following word */
ldw -8(s_addr),tmp4 /* tmp4 = following word */
ldw -4(s_addr),tmp5 /* tmp5 = following word */
vshd tmp1,tmp2,tmp6 /* tmp6 = tmp1:tmp2 shifted right by the amount in cr11. */
stbys,b,m tmp6,4(0,d_addr) /* Store with front-porch handling; d_addr advances by 4. */
vshd tmp2,tmp3,tmp6 /* Merge tmp2:tmp3. */
stwm tmp6,4(0,d_addr) /* Store; d_addr += 4. */
vshd tmp3,tmp4,tmp6 /* Merge tmp3:tmp4. */
stwm tmp6,4(0,d_addr) /* Store; d_addr += 4. */
vshd tmp4,tmp5,tmp6 /* Merge tmp4:tmp5. */
stwm tmp6,4(0,d_addr) /* Store; d_addr += 4. */
addibf,< -16,count,chunk2 /* count -= 16; loop again unless the result went negative. */
copy tmp5,tmp1 /* DELAY SLOT: carry the last loaded word into tmp1 for the next merge. */
chkchnk2:
addibt,<,n 12,count,bp_0 /* count += 12 (bias of -4); negative -> fewer than 4 bytes left, go to bp_0. */
/* The loop below is unrolled twice so that tmp1 and tmp2 alternate as the newest source word. */
subchnk2:
ldwm 4(0,s_addr),tmp2 /* tmp2 = next source word; s_addr += 4 */
vshd tmp1,tmp2,tmp3 /* Merge tmp1:tmp2. */
addibt,< -4,count,bp_1 /* count -= 4; negative -> go to bp_1 (newest word is already in tmp2). */
stbys,b,m tmp3,4(0,d_addr) /* DELAY SLOT (no ,n, so it runs on both paths): store; d_addr advances by 4. */
ldwm 4(0,s_addr),tmp1 /* tmp1 = next source word; s_addr += 4 */
vshd tmp2,tmp1,tmp3 /* Merge tmp2:tmp1. */
addib,>= -4,count,subchnk2 /* count -= 4; loop while the result is >= 0, otherwise fall through to bp_0. */
stbys,b,m tmp3,4(0,d_addr) /* DELAY SLOT (runs on both paths): store; d_addr advances by 4. */
bp_0: copy tmp1,tmp2 /* Newest source word is in tmp1 here; move it to tmp2 so bp_1 can assume tmp2. */
bp_1: addibt,<=,n 4,count,done /* count += 4 removes the bias; nothing left (count <= 0) -> done. */
add d_addr,count,d_addr /* d_addr += count: address one past the last byte to be written. */
mfctl sar,tmp3 /* Read back the bit shift amount. */
extru tmp3,28,2,tmp3 /* Convert the bit shift to a byte shift (drop the low three bits, keep two). */
sub,<= count,tmp3,r0 /* If count <= byte shift, the remaining bytes are already in tmp2: nullify the load below. */
ldwm 4(0,s_addr),tmp1 /* Otherwise fetch the final source word. */
vshd tmp2,tmp1,tmp3 /* Merge tmp2:tmp1. */
bv 0(r2) /* Return through r2. */
stbys,e tmp3,0(0,d_addr) /* DELAY SLOT of the return: "e" (end) form stores the trailing bytes. */
/* Byte-at-a-time copy, used when count <= 5. */
byteloop:
comb,>=,n 0,count,done /* Nothing to copy when 0 >= count. */
encore:
ldbs,ma 1(s_addr),tmp1 /* tmp1 = *s_addr; s_addr += 1 */
addibf,= -1,count,encore /* count -= 1; loop again unless the result is zero. */
stbs,ma tmp1,1(d_addr) /* DELAY SLOT (runs every iteration): *d_addr = tmp1; d_addr += 1 */
done:
/* NOTE(review): EXIT is a macro from DEFS.h, not visible here; presumably it emits */
/* the return and the end-of-procedure directives - confirm. */
EXIT(memcpy)
|