1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
|
/*
* (c) Copyright 1986 HEWLETT-PACKARD COMPANY
*
* To anyone who acknowledges that this file is provided "AS IS"
* without any express or implied warranty:
* permission to use, copy, modify, and distribute this file
* for any purpose is hereby granted without fee, provided that
* the above copyright notice and this notice appears in all
* copies, and that the name of Hewlett-Packard Company not be
* used in advertising or publicity pertaining to distribution
* of the software without specific, written prior permission.
* Hewlett-Packard Company makes no representations about the
* suitability of this software for any purpose.
*/
/*
A faster strcpy.
by
Jerry Huck (aligned case)
Daryl Odnert (equal-alignment case)
Edgar Circenis (non-aligned case)
*/
/*
* strcpy(s1, s2)
*
* Copy the null-terminated string s2, including its terminator, to s1.
* The caller must ensure that s1 is large enough to hold the result.
* Returns s1.
*/
#include "DEFS.h"
/*
 * Register aliases used by strcpy below.  Several aliases share one
 * register: tmp1/evenside are both r19 and tmp2/oddside are both r20, so
 * the aligned loop and the unaligned paths reuse the same two registers
 * under different names.
 */
/* Destination pointer; copied to ret0 on entry, then advanced by the stores. */
#define d_addr r26
/* Source pointer; advanced by the post-incrementing loads. */
#define s_addr r25
/* Alignment case index at entry; later the low 2 bits of the destination. */
#define tmp6 r24
/* Scratch / source word in the unaligned paths (same register as evenside). */
#define tmp1 r19
/* Source word loaded on the "even" half of the unrolled aligned loop. */
#define evenside r19
/* Scratch / source word in the unaligned paths (same register as oddside). */
#define tmp2 r20
/* Source word loaded on the "odd" half of the unrolled aligned loop. */
#define oddside r20
/* Shift quantity in bytes (TGT-SRC) from the jump table, then realigned data. */
#define tmp3 r21
/* Shift amount in bits (tmp3 * 8); also a single-byte scratch in first_null. */
#define tmp4 r22
/* Not referenced by the strcpy code visible in this file. */
#define tmp5 arg3
/* Word under test for a null byte; holds the final word passed to nullfound. */
#define save r1
/*
 * strcpy: copy the null-terminated string at s_addr to d_addr, terminator
 * included, and return the original d_addr in ret0.
 *
 * The path is selected from the low two bits of both pointers:
 *   - both word aligned: bothaligned / loop / twoatatime copy one word per
 *     load, unrolled two words per trip, testing each word for a zero byte;
 *   - same non-zero alignment: 1 to 3 leading bytes are copied singly, then
 *     control joins bothaligned;
 *   - different alignment: pos_aligned_copy / neg_aligned_copy read aligned
 *     source words and realign them with vshd before storing.
 *
 * Reading notes.  Every branch has a delay slot.  A ",n" on a branch
 * nullifies that slot; a condition completer on a non-branch instruction
 * (extru,<>  dep,=  uxor,nbz  uxor,sbz  comiclr,<>) nullifies the NEXT
 * instruction when the condition holds.  Several labels below sit on an
 * instruction that is also the delay slot of the branch just above it.
 * Do not reorder instructions without re-checking every such pairing.
 */
ENTRY(strcpy)
/* Quick alignment check; the both-word-aligned case is the fast path. */
extru,<> s_addr,31,2,tmp6 /* tmp6 = src & 3; skip the load if non-zero */
ldwm 4(0,s_addr),oddside /* Source is word aligned: speculatively load the
first word (s_addr += 4), guessing that it is also double-word aligned. */
dep,= d_addr,29,2,tmp6 /* tmp6 = (dst & 3) << 2 | (src & 3); skip branch if 0 */
b case_analysis
/* ret0 = original destination.  Runs as the delay slot of the branch
   above, or by plain fall-through when that branch was nullified. */
copy d_addr,ret0
/* Both are aligned.  The first source word is already in oddside, loaded
   on the assumption that the source was odd-word aligned.  The
   fall-through (therefore fastest) code shuffles the registers to join
   the main loop. */
bothaligned:
/* s_addr was already advanced by 4, so bit 29 clear here means the
   original source address had it set. */
bb,>= s_addr,29,twoatatime /*Branch if source was odd aligned*/
uxor,nbz oddside,r0,save /* delay slot: save = word; skip next if no zero byte */
/* Even aligned source; save holds that first word.
   Run one and a half iterations of the copy loop, juggling the registers
   to avoid one copy. */
b,n nullfound
ldwm 4(s_addr),oddside
stwm save,4(d_addr)
uxor,nbz oddside,r0,save
b,n nullfound
ldwm 4(s_addr),evenside
stwm oddside,4(d_addr)
uxor,nbz evenside,r0,save
b,n nullfound
ldwm 4(s_addr),oddside
/* Main loop body.  On entry evenside is still to be stored and oddside
   has just been loaded. */
loop:
stwm evenside,4(d_addr)
uxor,nbz oddside,r0,save /* skip the exit branch if oddside has no zero byte */
/* mid loop entry: oddside has been tested but not yet stored */
twoatatime:
b,n nullfound
ldwm 4(s_addr),evenside
stwm oddside,4(d_addr)
uxor,sbz evenside,r0,save /* skip "b loop" if evenside has a zero byte */
b loop
ldwm 4(s_addr),oddside
/* Fall through when a null was found in evenside.  The load into oddside
   just above has still been executed in that case.
   NOTE(review): that load reads the word after the one holding the
   terminator; confirm this over-read is acceptable for all callers. */
/* nullfound: save holds the last word, which contains the terminator.
   Find the first zero byte, bump d_addr past it and store only the bytes
   up to and including the terminator.  Presumably d_addr is word aligned
   whenever this label is reached; confirm when changing any caller. */
nullfound: /* adjust d_addr and store final word */
extru,<> save,7,8,r0 /* leftmost byte non-zero? then skip next */
addib,tr,n 1,d_addr,store_final /* null is byte 0: store 1 byte */
extru,<> save,15,8,r0
addib,tr,n 2,d_addr,store_final /* null is byte 1: store 2 bytes */
extru,<> save,23,8,r0
/* NOTE(review): no ",n" here, so the bv on the next line sits in the
   delay slot of this addib.  The apparent intent is that the return is
   issued from that slot and the stbys,e at store_final2 then runs as the
   delay slot of the return; confirm against the architecture manual. */
addib,tr 3,d_addr,store_final2 /* null is byte 2: store 3 bytes */
bv 0(rp) /* null is byte 3: return ... */
stw save,0(d_addr) /* ... storing the whole word (delay slot) */
store_final:
bv 0(rp)
store_final2:
stbys,e save,0(d_addr) /* delay slot: store the bytes that precede d_addr */
/* Dispatch on tmp6 through a table of two-instruction entries that starts
   right after the delay slot of the blr. */
case_analysis:
blr tmp6,r0
nop
/* NOTE: the delay slots for the non-aligned cases load a */
/* shift quantity, TGT-SRC in bytes, into tmp3. */
/* The case of both strings being word aligned is checked */
/* before the BLR is executed, so entry 00 00 can never be */
/* reached. */
/* Entries with SRC == 00 go to pos_aligned_copy0, which */
/* undoes the speculative load done at entry. */
/* TGT SRC */
nop /* 00 00 can't happen */
nop
b neg_aligned_copy /* 00 01 */
ldi -1,tmp3 /* load shift quantity. delay slot */
b neg_aligned_copy /* 00 10 */
ldi -2,tmp3 /* load shift quantity. delay slot */
b neg_aligned_copy /* 00 11 */
ldi -3,tmp3 /* load shift quantity. delay slot */
b pos_aligned_copy0 /* 01 00 */
ldi 1,tmp3 /* load shift quantity. delay slot */
b equal_alignment_1 /* 01 01 */
ldbs,ma 1(s_addr),tmp1 /* load first byte, s_addr += 1. delay slot */
b neg_aligned_copy /* 01 10 */
ldi -1,tmp3 /* load shift quantity. delay slot */
b neg_aligned_copy /* 01 11 */
ldi -2,tmp3 /* load shift quantity. delay slot */
b pos_aligned_copy0 /* 10 00 */
ldi 2,tmp3 /* load shift quantity. delay slot */
b pos_aligned_copy /* 10 01 */
ldi 1,tmp3 /* load shift quantity. delay slot */
b equal_alignment_2 /* 10 10 */
ldhs,ma 2(s_addr),tmp1 /* load first halfword, s_addr += 2. delay slot */
b neg_aligned_copy /* 10 11 */
ldi -1,tmp3 /* load shift quantity. delay slot */
b pos_aligned_copy0 /* 11 00 */
ldi 3,tmp3 /* load shift quantity. delay slot */
b pos_aligned_copy /* 11 01 */
ldi 2,tmp3 /* load shift quantity. delay slot */
b pos_aligned_copy /* 11 10 */
ldi 1,tmp3 /* load shift quantity. delay slot */
/* Last table entry (11 11) is handled inline: one byte brings both
   pointers to a word boundary. */
ldbs,ma 1(s_addr),tmp1 /* 11 11 */
comiclr,<> r0,tmp1,r0 /* nullify next if tmp1 <> 0 */
bv 0(rp) /* return if 1st byte was null */
stbs,ma tmp1,1(d_addr) /* store a byte to dst string (also the delay slot) */
b bothaligned /* both pointers are now word aligned */
ldwm 4(s_addr),oddside /* load next word of source */
/* Both pointers at offset 1: copy one byte, then fall into the halfword
   case.  tmp1 was loaded in the delay slot of the table branch. */
equal_alignment_1:
comiclr,<> r0,tmp1,r0 /* nullify next if tmp1 <> 0 */
bv 0(rp) /* return if null byte found */
stbs,ma tmp1,1(d_addr) /* store a byte to dst string (also the delay slot) */
ldhs,ma 2(s_addr),tmp1 /* load next halfword */
/* Both pointers at offset 2: copy the two bytes of the halfword in tmp1,
   returning as soon as one of them is the terminator. */
equal_alignment_2:
extru,<> tmp1,23,8,tmp6 /* look at left byte of halfword */
bv 0(rp) /* return if 1st byte was null */
stbs,ma tmp6,1(d_addr) /* store left byte (also the delay slot) */
extru,<> tmp1,31,8,r0 /* look at right byte of halfword */
bv 0(rp) /* return if 2nd byte was null */
stbs,ma tmp1,1(d_addr) /* store right byte (also the delay slot) */
b bothaligned
ldwm 4(s_addr),oddside /* load next word */
/* Source and destination alignments differ, so we do it the hard way. */
/* Target alignment is greater than source alignment. */
pos_aligned_copy0:
/* Source was word aligned, so the load at entry ran and advanced s_addr;
   back it up to the start of the string. */
addi -4,s_addr,s_addr
pos_aligned_copy:
extru d_addr,31,2,tmp6 /* Extract low 2 bits of the dest addr */
extru s_addr,31,2,tmp1 /* Extract low 2 bits of the src addr */
dep r0,31,2,s_addr /* Compute word address of the source. */
sh3add tmp3,r0,tmp4 /* shift amount in bits = tmp3 * 8 */
ldwm 4(0,s_addr),tmp2 /* get 1st source word */
sh3add tmp1,r0,save /* setup mask shift amount (src offset * 8) */
mtctl save,r11 /* set-up cr11 for mask */
zvdepi -2,32,save /* create mask */
or save,tmp2,tmp2 /* force bytes before the string start non-zero */
ldi -1,tmp1 /* load tmp1 with 0xffffffff */
mtctl tmp4,r11 /* shift count -> shift count reg */
vshd tmp1,tmp2,tmp3 /* position data ! */
uxor,nbz tmp3,r0,save /* null within the first destination word? */
b,n first_null
uxor,nbz tmp2,r0,save /* null in the source bytes shifted out of tmp3? */
b nullfound1
mtctl tmp4,r11 /* re-load shift cnt (delay slot) */
b loop_entry
ldwm 4(0,s_addr),tmp1 /* get next word. delay slot */
/* Target alignment is less than source alignment; tmp3 is negative. */
neg_aligned_copy:
extru d_addr,31,2,tmp6 /* Extract low 2 bits of the dest addr */
extru s_addr,31,2,tmp2 /* Extract low 2 bits of the src addr */
dep r0,31,2,s_addr /* Compute word address of the source. */
sh3add tmp3,r0,tmp4 /* shift amount in bits = tmp3 * 8 */
ldwm 4(0,s_addr),tmp1 /* load first word from source. */
/* Check whether the next word can be read safely: only load it if the
   first word, with the bytes before the string start masked, has no null. */
sh3add tmp2,r0,save
mtctl save,r11 /* shift count -> shift count reg */
zvdepi -2,32,save
or save, tmp1, tmp1
uxor,nbz tmp1,r0,save /* any nulls in first word? */
b first_null0
mtctl tmp4,r11 /* load shift cnt (delay slot, or fall-through) */
ldwm 4(0,s_addr),tmp2 /* load second word from source */
combt,= tmp6,r0,chunk1 /* don't mask if whole word valid */
vshd tmp1,tmp2,tmp3 /* position data ! */
sh3add tmp6,r0,save /* setup r1 (dest offset * 8) */
mtctl save,r11 /* set-up cr11 for mask */
zvdepi -2,32,save
or save, tmp3, tmp3
uxor,nbz tmp3,r0,save /* null within the first destination word? */
b,n first_null
uxor,nbz tmp2,r0,save /* null anywhere in the second source word? */
b nullfound1
mtctl tmp4,r11 /* re-load shift cnt (delay slot) */
b loop_entry
ldwm 4(0,s_addr),tmp1 /* get next word. delay slot */
/* Destination is word aligned: the first destination word is whole. */
chunk1:
uxor,nbz tmp2,r0,save
b nullfound0
vshd tmp1,tmp2,tmp3
/* Unaligned main loop, two words per trip.  tmp1 and tmp2 alternate as
   the newest source word; tmp3 is the realigned word to store next. */
did_mask:
ldwm 4(0,s_addr),tmp1 /* get next word ! */
loop_entry:
stbys,b,m tmp3,4(0,d_addr) /* store ! */
uxor,nbz tmp1, r0, save
b nullfound2
vshd tmp2,tmp1,tmp3 /* position data ! */
ldwm 4(s_addr),tmp2
stwm tmp3,4(d_addr)
uxor,sbz tmp2,r0,save /* skip "b did_mask" if tmp2 has a zero byte */
b did_mask
/* The instruction at nullfound0 is also the delay slot of "b did_mask". */
nullfound0:
vshd tmp1,tmp2,tmp3 /* delay slot */
uxor,nbz tmp3,r0,save /* null already inside the realigned word? */
b,n nullfound
/* The null lies in the part of tmp2 not covered by tmp3: store tmp3, then
   finish with the remaining bytes of tmp2. */
nullfound1:
stbys,b,m tmp3,4(0,d_addr)
b nullfound
vshd tmp2,r0,save /* delay slot */
/* The null is somewhere in tmp1; tmp3 was built from tmp2 and tmp1. */
nullfound2:
uxor,nbz tmp3,r0,save
b,n nullfound
stwm tmp3,4(d_addr)
b nullfound
/* Note that the delay slot of the branch above is the first instruction
   of the next routine. */
first_null0: /* null found in first word of non-aligned (wrt d_addr) */
vshd tmp1,r0,save /* delay slot */
combt,= tmp6,r0,check4 /* dest word aligned: check all 4 bytes */
extru save,7,8,tmp4 /* delay slot: tmp4 = leftmost byte of save */
/* first_null: save holds the first destination word and tmp6 the byte
   offset of the destination within it.  Copy byte by byte from that
   offset until the terminator has been stored. */
first_null:
addibt,= -1,tmp6,check3 /* check last 3 bytes of word */
extru save,15,8,tmp4 /* delay slot: tmp4 = second byte of save */
addibt,=,n -1,tmp6,check2 /* check last 2 bytes */
bv 0(rp) /* null in last byte--store and exit */
stbys,b save, 0(d_addr)
check4:
combt,= tmp4,r0,done
stbs,ma tmp4,1(d_addr) /* delay slot: store the byte, null or not */
extru,<> save,15,8,tmp4 /* second byte; skip the test below if non-zero */
check3:
combt,= tmp4,r0,done
stbs,ma tmp4,1(d_addr) /* delay slot: store the byte, null or not */
check2:
extru,<> save,23,8,tmp4 /* third byte; skip the return below if non-zero */
bv 0(rp)
stbs,ma tmp4,1(d_addr) /* store third byte (also the delay slot) */
bv 0(rp)
stbs r0,0(d_addr) /* fourth byte must be the terminator: store zero */
/* Branches to done rely on EXIT to return.  EXIT comes from DEFS.h, which
   is not visible here; presumably it emits the return sequence. */
done:
EXIT(strcpy)
|