packages/extra/hermes/i386/mmx_clr.as


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271

#
# MMX surface clear routines for HERMES
# Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
# This source code is licensed under the GNU LGPL
# 
# Please refer to the file COPYING.LIB contained in the distribution for
# licensing conditions
#


# Entry points made visible to the linker, one per destination pixel
# size (32, 24, 16 and 8 bits per pixel).
.globl _ClearMMX_32
.globl _ClearMMX_24
.globl _ClearMMX_16
.globl _ClearMMX_8

# Everything below is assembled into the code section.
.text

##
## --------------------------------------------------------------------------
## HermesClearInterface (ebp+..)
##   0: char8 *dest
##   4: int32 value
##   8: unsigned int width (already checked to be >0!)
##  12: unsigned int height (already checked to be >0!)
##  16: int add


##
## _ClearMMX_32 -- fill a rectangle of 32-bit pixels using MMX stores.
##
## In:    one stack argument: pointer to a HermesClearInterface
##        (dest, value, width, height, add at offsets 0, 4, 8, 12, 16).
## Out:   nothing in registers; the destination memory is overwritten.
## Clobb: %eax, %ecx, %edx, %mm0, %mm1, flags.
##        %ebx and %edi are callee-saved in the i386 C calling
##        convention, so they are pushed on entry and popped on exit.
##
## Register roles inside the loops:
##   %ebp  interface pointer (takes the place of the frame pointer)
##   %eax  pixel value              %mm0  pixel value in both dwords
##   %edi  write cursor             %edx  rows left
##   %ecx  qwords left in the row   %ebx  row width in pixels
##
_ClearMMX_32: 
        pushl %ebp
        movl %esp,%ebp
        pushl %ebx              # callee-saved, holds the row width below
        pushl %edi              # callee-saved, used as the write cursor

        movl 8(%ebp),%ebp       # ebp = interface pointer

        movl 4(%ebp),%eax       # pixel value
        movd 4(%ebp),%mm0

        movl 12(%ebp),%edx      # height
        movq %mm0,%mm1

        psllq $32,%mm0          # pixel into the high dword
        movl (%ebp),%edi        # destination

        por %mm1,%mm0           # mm0 = pixel:pixel
_ClearMMX_32.L_y: 
        movl 8(%ebp),%ecx       # width in pixels

        movl %ecx,%ebx          # keep the width for the odd-pixel test

        shrl $1,%ecx            # two pixels per qword store
        jz _ClearMMX_32.L_last  # width == 1: no qword stores

_ClearMMX_32.L_x: 
        movq %mm0,(%edi)
        addl $8,%edi

        decl %ecx
        jnz _ClearMMX_32.L_x


_ClearMMX_32.L_last: 
        testl $1,%ebx           # odd width leaves one pixel over
        jz _ClearMMX_32.L_endline

        movl %eax,(%edi)
        addl $4,%edi

_ClearMMX_32.L_endline: 

        addl 16(%ebp),%edi      # apply the per-row "add" from the interface

        decl %edx
        jnz _ClearMMX_32.L_y

        emms                    # leave MMX state before returning

        popl %edi
        popl %ebx
        popl %ebp
        ret


## _ClearMMX_24 -- placeholder for a 24-bit clear.
## Returns immediately: it does not read its argument and writes no
## memory, so calling it leaves the destination surface untouched.
_ClearMMX_24: 
        ret


##
## _ClearMMX_16 -- fill a rectangle of 16-bit pixels using MMX stores.
##
## In:    one stack argument: pointer to a HermesClearInterface
##        (dest, value, width, height, add at offsets 0, 4, 8, 12, 16).
##        Only the low 16 bits of value are used as the pixel.
## Out:   nothing in registers; the destination memory is overwritten.
## Clobb: %eax, %ecx, %edx, %mm0, %mm1, flags.
##        %ebx and %edi are callee-saved in the i386 C calling
##        convention, so they are pushed on entry and popped on exit.
##
## Register roles inside the loops:
##   %ebp  interface pointer (takes the place of the frame pointer)
##   %eax  pixel in both words      %mm0  pixel in all four words
##   %edi  write cursor             %edx  rows left
##   %ecx  pixels, then qwords left in the row
##   %ebx  pixels left after the optional lead-in pixel
##
_ClearMMX_16: 
        pushl %ebp
        movl %esp,%ebp
        pushl %ebx              # callee-saved, scratch and pixel count below
        pushl %edi              # callee-saved, used as the write cursor

        movl 8(%ebp),%ebp       # ebp = interface pointer

        movl 4(%ebp),%eax       # pixel value
        movl 4(%ebp),%ebx

        movl 12(%ebp),%edx      # height
        movl (%ebp),%edi        # destination

        shll $16,%eax           # Duplicate pixel value
        andl $0x0ffff,%ebx

        orl %ebx,%eax           # eax = pixel:pixel

        movd %eax,%mm0
        movd %eax,%mm1

        psllq $32,%mm0

        por %mm1,%mm0           # mm0 = four copies of the pixel
_ClearMMX_16.L_y: 
        movl 8(%ebp),%ecx       # width in pixels

        testl $3,%edi           # Check if destination is aligned mod 4
        jz _ClearMMX_16.L_aligned

        movw %ax,(%edi)         # otherwise write one pixel
        addl $2,%edi

        decl %ecx
        jz _ClearMMX_16.L_endline

_ClearMMX_16.L_aligned: 
        movl %ecx,%ebx          # remember the count for the tail
        shrl $2,%ecx            # four pixels per qword store

        jz _ClearMMX_16.L_last

_ClearMMX_16.L_x: 
        movq %mm0,(%edi)
        addl $8,%edi

        decl %ecx
        jnz _ClearMMX_16.L_x

_ClearMMX_16.L_last: 
        andl $3,%ebx            # 0..3 pixels still to write
        jz _ClearMMX_16.L_endline

        movw %ax,(%edi)         # Write trailing pixels
        addl $2,%edi
        decl %ebx
        jz _ClearMMX_16.L_endline

        movw %ax,(%edi)
        addl $2,%edi
        decl %ebx
        jz _ClearMMX_16.L_endline

        movw %ax,(%edi)         # third and last possible trailing pixel
        addl $2,%edi

_ClearMMX_16.L_endline: 
        addl 16(%ebp),%edi      # apply the per-row "add" from the interface

        decl %edx
        jnz _ClearMMX_16.L_y

        emms                    # leave MMX state before returning

        popl %edi
        popl %ebx
        popl %ebp
        ret


##
## _ClearMMX_8 -- fill a rectangle of 8-bit pixels using MMX stores.
##
## Original author's note: this routine isn't fully optimised yet, as it
## seemed to be a tiny bit slower than the C routine.
##
## In:    one stack argument: pointer to a HermesClearInterface
##        (dest, value, width, height, add at offsets 0, 4, 8, 12, 16).
##        Only the low 8 bits of value are used as the pixel.
## Out:   nothing in registers; the destination memory is overwritten.
## Clobb: %eax, %ecx, %edx, %mm0, %mm1, flags.
##        %ebx and %edi are callee-saved in the i386 C calling
##        convention, so they are pushed on entry and popped on exit.
##        "rep stosb" is used for the tail, so the direction flag must be
##        clear on entry.
##
## Register roles inside the loops:
##   %ebp  interface pointer (takes the place of the frame pointer)
##   %eax  pixel in all four bytes  %mm0  pixel in all eight bytes
##   %edi  write cursor             %edx  rows left
##   %ecx  pixels, then qwords, then tail bytes left in the row
##   %ebx  lead-in byte count, then pixels left after the lead-in
##
_ClearMMX_8: 
        pushl %ebp
        movl %esp,%ebp
        pushl %ebx              # callee-saved, scratch and counters below
        pushl %edi              # callee-saved, used as the write cursor

        movl 8(%ebp),%ebp       # ebp = interface pointer

        movl 4(%ebp),%eax       # pixel value
        movl 4(%ebp),%ebx

        movl 12(%ebp),%edx      # height
        andl $0x0ff,%ebx

        shll $8,%eax            # Put the byte pixel value in all four bytes
        movl (%ebp),%edi        # destination

        movb %bl,%al
        movb %bl,%bh

        shll $16,%eax

        movb %bh,%ah
        movb %bl,%al

        movd %eax,%mm0
        movd %eax,%mm1

        psllq $32,%mm0

        por %mm1,%mm0           # mm0 = eight copies of the pixel

_ClearMMX_8.L_y: 
        movl 8(%ebp),%ecx       # width in pixels

        testl $3,%edi           # Align mod 4
        jz _ClearMMX_8.L_aligned

        movl %edi,%ebx

        negl %ebx               # bytes needed to reach the next multiple
        andl $3,%ebx            # of 4: (-edi) & 3, here always 1..3

        movb %al,(%edi)         # Unrolled (copy & paste), align and jump
        incl %edi               # if finished, faster than a loop...
        decl %ecx
        jz _ClearMMX_8.L_endline
        decl %ebx
        jz _ClearMMX_8.L_aligned

        movb %al,(%edi)         # Second pixel
        incl %edi
        decl %ecx
        jz _ClearMMX_8.L_endline
        decl %ebx
        jz _ClearMMX_8.L_aligned

        movb %al,(%edi)         # Third pixel
        incl %edi
        decl %ecx
        jz _ClearMMX_8.L_endline

_ClearMMX_8.L_aligned: 
        movl %ecx,%ebx          # Store ecx for later

        shrl $3,%ecx            # We write 8 pixels at once
        jz _ClearMMX_8.L_last

_ClearMMX_8.L_x: 
        movq %mm0,(%edi)
        addl $8,%edi

        decl %ecx
        jnz _ClearMMX_8.L_x

_ClearMMX_8.L_last: 
        movl %ebx,%ecx          # Clean up trailing pixels

        andl $7,%ecx            # Could be up to 7 left
        jz _ClearMMX_8.L_endline

        testb $4,%cl            # If there are fewer than four, jump
        jz _ClearMMX_8.L_lessthanfour

        movl %eax,(%edi)        # Otherwise write a dword
        addl $4,%edi

        subl $4,%ecx

_ClearMMX_8.L_lessthanfour: 
        rep stosb               # Clean up the very rest (0..3 bytes)

_ClearMMX_8.L_endline: 
        addl 16(%ebp),%edi      # apply the per-row "add" from the interface

        decl %edx
        jnz _ClearMMX_8.L_y

        emms                    # leave MMX state before returning

        popl %edi
        popl %ebx
        popl %ebp
        ret