1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
|
; vim:filetype=nasm ts=8
; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2001-2009 Josh Coalson
; Copyright (C) 2011-2013 Xiph.Org Foundation
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; - Neither the name of the Xiph.org Foundation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "nasm.h"
; Section / symbol set-up. data_section, cglobal and code_section are not
; defined in this file; presumably they are macros supplied by nasm.h
; (TODO confirm there).
data_section
; NOTE(review): cglobal appears to declare the routine's symbol as global so
; that it can be called from outside this file -- confirm against nasm.h.
cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
code_section
; **********************************************************************
;
; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 *data, unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
; {
; FLAC__int32 last_error_0 = data[-1];
; FLAC__int32 last_error_1 = data[-1] - data[-2];
; FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]);
; FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
; FLAC__int32 error, save;
; FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0;
; unsigned i, order;
;
; for(i = 0; i < data_len; i++) {
; error = data[i] ; total_error_0 += local_abs(error); save = error;
; error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error;
; error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error;
; error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error;
; error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
; }
;
; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
; order = 0;
; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
; order = 1;
; else if(total_error_2 < min(total_error_3, total_error_4))
; order = 2;
; else if(total_error_3 < total_error_4)
; order = 3;
; else
; order = 4;
;
; residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
;
; return order;
; }
; ----------------------------------------------------------------------
; FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
;
; IA-32 implementation (uses MMX and CMOVcc instructions) of the C reference
; routine quoted in the comment block above.
;
; In (all three arguments are read from the stack):
;   data                     - pointer to the samples; data[-4]..data[-1] are
;                              read as warm-up history whenever data_len != 0
;   data_len                 - number of samples to process
;   residual_bits_per_sample - five 32-bit float slots, all of them written
; Out:
;   eax = selected order (0..4); 4 when data_len == 0
;
; Registers: ebp, ebx, esi and edi are saved and restored here; eax, ecx, edx
; and mm0-mm7 are overwritten. emms is executed before the x87 code and the
; x87 stack is popped back to its entry depth before returning.
;
; The five totals are accumulated with 32-bit paddd (wrapping on overflow) and
; are compared as unsigned values when the order is chosen.
; ----------------------------------------------------------------------
ALIGN 16
cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
; Argument offsets as seen after the prologue below
; (return address + 4 pushes + 16 bytes of locals = 36 bytes):
; esp + 36 == data[]
; esp + 40 == data_len
; esp + 44 == residual_bits_per_sample[]
push ebp
push ebx
push esi
push edi
sub esp, byte 16
; qword [esp] == temp space for loading FLAC__uint64s to FPU regs
; ebx == &data[i] (during the loop; reused afterwards, see below)
; ecx == number of samples still to process (counts down to 0)
; ebp == order
; MMX registers are annotated as high_dword:low_dword.
; mm0 == total_error_1:total_error_0
; mm1 == total_error_2:total_error_3
; mm2 == :total_error_4
; mm3 == last_error_1:last_error_0
; mm4 == last_error_2:last_error_3
mov ecx, [esp + 40] ; ecx = data_len
test ecx, ecx
jz near .data_len_is_0
; Warm-up: derive last_error_0..3 from the four samples preceding data[0].
mov ebx, [esp + 36] ; ebx = data[]
movd mm3, [ebx - 4] ; mm3 = 0:last_error_0
movd mm2, [ebx - 8] ; mm2 = 0:data[-2]
movd mm1, [ebx - 12] ; mm1 = 0:data[-3]
movd mm0, [ebx - 16] ; mm0 = 0:data[-4]
movq mm5, mm3 ; mm5 = 0:last_error_0
psubd mm5, mm2 ; mm5 = 0:last_error_1
punpckldq mm3, mm5 ; mm3 = last_error_1:last_error_0
psubd mm2, mm1 ; mm2 = 0:data[-2] - data[-3]
psubd mm5, mm2 ; mm5 = 0:last_error_2
movq mm4, mm5 ; mm4 = 0:last_error_2
psubd mm4, mm2 ; mm4 = 0:last_error_2 - (data[-2] - data[-3])
paddd mm4, mm1 ; mm4 = 0:last_error_2 - (data[-2] - 2 * data[-3])
psubd mm4, mm0 ; mm4 = 0:last_error_3
punpckldq mm4, mm5 ; mm4 = last_error_2:last_error_3
; The history samples are no longer needed; mm0-mm2 become the accumulators.
pxor mm0, mm0 ; mm0 = total_error_1:total_error_0
pxor mm1, mm1 ; mm1 = total_error_2:total_error_3
pxor mm2, mm2 ; mm2 = 0:total_error_4
ALIGN 16
; One sample per iteration. Errors are handled two per register; a lane left
; blank in a comment (e.g. ":error_1") holds a by-product that is never used.
; abs() is computed branch-free as (x xor s) - s with s = x >> 31 (arithmetic).
.loop:
movd mm7, [ebx] ; mm7 = 0:error_0
add ebx, byte 4
movq mm6, mm7 ; mm6 = 0:error_0
psubd mm7, mm3 ; mm7 = :error_1
punpckldq mm6, mm7 ; mm6 = error_1:error_0
movq mm5, mm6 ; mm5 = error_1:error_0
movq mm7, mm6 ; mm7 = error_1:error_0
psubd mm5, mm3 ; mm5 = error_2:
movq mm3, mm6 ; mm3 = error_1:error_0 (becomes last_error_1:last_error_0)
psrad mm6, 31 ; mm6 = per-lane sign masks (0 or -1)
pxor mm7, mm6
psubd mm7, mm6 ; mm7 = abs(error_1):abs(error_0)
paddd mm0, mm7 ; mm0 = total_error_1:total_error_0
movq mm6, mm5 ; mm6 = error_2:
psubd mm5, mm4 ; mm5 = error_3:
punpckhdq mm5, mm6 ; mm5 = error_2:error_3
movq mm7, mm5 ; mm7 = error_2:error_3
movq mm6, mm5 ; mm6 = error_2:error_3
psubd mm5, mm4 ; mm5 = :error_4
movq mm4, mm6 ; mm4 = error_2:error_3 (becomes last_error_2:last_error_3)
psrad mm6, 31 ; mm6 = per-lane sign masks (0 or -1)
pxor mm7, mm6
psubd mm7, mm6 ; mm7 = abs(error_2):abs(error_3)
paddd mm1, mm7 ; mm1 = total_error_2:total_error_3
movq mm6, mm5 ; mm6 = :error_4
psrad mm5, 31 ; mm5 = per-lane sign masks (0 or -1)
pxor mm6, mm5
psubd mm6, mm5 ; mm6 = :abs(error_4)
paddd mm2, mm6 ; mm2 = :total_error_4 (only the low dword is read later)
dec ecx
jnz short .loop
; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
; order = 0;
; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
; order = 1;
; else if(total_error_2 < min(total_error_3, total_error_4))
; order = 2;
; else if(total_error_3 < total_error_4)
; order = 3;
; else
; order = 4;
; Move the five totals into general-purpose registers:
; eax = total_error_0, ecx = total_error_1, edx = total_error_2,
; esi = total_error_3, edi = total_error_4.
movq mm3, mm0 ; mm3 = total_error_1:total_error_0
movd edi, mm2 ; edi = total_error_4
movd esi, mm1 ; esi = total_error_3
movd eax, mm0 ; eax = total_error_0
punpckhdq mm1, mm1 ; mm1 = total_error_2:total_error_2
punpckhdq mm3, mm3 ; mm3 = total_error_1:total_error_1
movd edx, mm1 ; edx = total_error_2
movd ecx, mm3 ; ecx = total_error_1
; Running minimum: eax = smallest total so far, ebp = its order,
; ebx = order of the candidate being tested. Both cmovs of a step use the
; flags of the same cmp (cmov leaves the flags unchanged). The order is
; updated on "below or equal", so on a tie the higher order is selected,
; matching the strict '<' tests of the C code above.
xor ebx, ebx
xor ebp, ebp ; order = 0
inc ebx ; candidate order = 1
cmp ecx, eax
cmovb eax, ecx ; eax = min(total_error_0, total_error_1)
cmovbe ebp, ebx
inc ebx ; candidate order = 2
cmp edx, eax
cmovb eax, edx ; eax = min(total_error_0, total_error_1, total_error_2)
cmovbe ebp, ebx
inc ebx ; candidate order = 3
cmp esi, eax
cmovb eax, esi ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3)
cmovbe ebp, ebx
inc ebx ; candidate order = 4
cmp edi, eax
cmovb eax, edi ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3, total_error_4)
cmovbe ebp, ebx
; eax was overwritten by the running minimum, so total_error_0 is fetched again.
movd ebx, mm0 ; ebx = total_error_0
emms ; leave MMX state before the x87 instructions below
; residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
; Each of the five blocks below follows the same pattern: a zero total stores
; 0.0 directly; otherwise the 32-bit total is zero-extended to 64 bits in
; [esp] so that fild reads it as a non-negative value, and fyl2x with y = 1.0
; yields log2(ln2 * total / data_len). data_len stays on the x87 stack across
; all five blocks and is popped at .rbps_end.
; eax = 0 serves both as the upper dword of the 64-bit temp and as the bit
; pattern of 0.0f.
xor eax, eax
fild dword [esp + 40] ; ST = data_len (NOTE: assumes data_len is <2gigs)
.rbps_0:
test ebx, ebx
jz .total_error_0_is_0
fld1 ; ST = 1.0 data_len
mov [esp], ebx
mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_0
mov ebx, [esp + 44] ; from here on ebx = residual_bits_per_sample[]
fild qword [esp] ; ST = total_error_0 1.0 data_len
fdiv st2 ; ST = total_error_0/data_len 1.0 data_len
fldln2 ; ST = ln2 total_error_0/data_len 1.0 data_len
fmulp st1 ; ST = ln2*total_error_0/data_len 1.0 data_len
fyl2x ; ST = log2(ln2*total_error_0/data_len) data_len
fstp dword [ebx] ; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len) ST = data_len
jmp short .rbps_1
.total_error_0_is_0:
mov ebx, [esp + 44] ; ebx = residual_bits_per_sample[] on this path too
mov [ebx], eax ; residual_bits_per_sample[0] = 0.0
.rbps_1:
test ecx, ecx
jz .total_error_1_is_0
fld1 ; ST = 1.0 data_len
mov [esp], ecx
mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_1
fild qword [esp] ; ST = total_error_1 1.0 data_len
fdiv st2 ; ST = total_error_1/data_len 1.0 data_len
fldln2 ; ST = ln2 total_error_1/data_len 1.0 data_len
fmulp st1 ; ST = ln2*total_error_1/data_len 1.0 data_len
fyl2x ; ST = log2(ln2*total_error_1/data_len) data_len
fstp dword [ebx + 4] ; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len) ST = data_len
jmp short .rbps_2
.total_error_1_is_0:
mov [ebx + 4], eax ; residual_bits_per_sample[1] = 0.0
.rbps_2:
test edx, edx
jz .total_error_2_is_0
fld1 ; ST = 1.0 data_len
mov [esp], edx
mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_2
fild qword [esp] ; ST = total_error_2 1.0 data_len
fdiv st2 ; ST = total_error_2/data_len 1.0 data_len
fldln2 ; ST = ln2 total_error_2/data_len 1.0 data_len
fmulp st1 ; ST = ln2*total_error_2/data_len 1.0 data_len
fyl2x ; ST = log2(ln2*total_error_2/data_len) data_len
fstp dword [ebx + 8] ; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len) ST = data_len
jmp short .rbps_3
.total_error_2_is_0:
mov [ebx + 8], eax ; residual_bits_per_sample[2] = 0.0
.rbps_3:
test esi, esi
jz .total_error_3_is_0
fld1 ; ST = 1.0 data_len
mov [esp], esi
mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_3
fild qword [esp] ; ST = total_error_3 1.0 data_len
fdiv st2 ; ST = total_error_3/data_len 1.0 data_len
fldln2 ; ST = ln2 total_error_3/data_len 1.0 data_len
fmulp st1 ; ST = ln2*total_error_3/data_len 1.0 data_len
fyl2x ; ST = log2(ln2*total_error_3/data_len) data_len
fstp dword [ebx + 12] ; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len) ST = data_len
jmp short .rbps_4
.total_error_3_is_0:
mov [ebx + 12], eax ; residual_bits_per_sample[3] = 0.0
.rbps_4:
test edi, edi
jz .total_error_4_is_0
fld1 ; ST = 1.0 data_len
mov [esp], edi
mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_4
fild qword [esp] ; ST = total_error_4 1.0 data_len
fdiv st2 ; ST = total_error_4/data_len 1.0 data_len
fldln2 ; ST = ln2 total_error_4/data_len 1.0 data_len
fmulp st1 ; ST = ln2*total_error_4/data_len 1.0 data_len
fyl2x ; ST = log2(ln2*total_error_4/data_len) data_len
fstp dword [ebx + 16] ; residual_bits_per_sample[4] = log2(ln2*total_error_4/data_len) ST = data_len
jmp short .rbps_end
.total_error_4_is_0:
mov [ebx + 16], eax ; residual_bits_per_sample[4] = 0.0
.rbps_end:
fstp st0 ; pop data_len, ST = [empty]
jmp short .end
.data_len_is_0:
; data_len == 0, so residual_bits_per_sample[*] = 0.0
; (no MMX or x87 instruction has been executed on this path)
xor ebp, ebp ; ebp = 0, also the bit pattern of 0.0f
mov edi, [esp + 44] ; edi = residual_bits_per_sample[]
mov [edi], ebp
mov [edi + 4], ebp
mov [edi + 8], ebp
mov [edi + 12], ebp
mov [edi + 16], ebp
add ebp, byte 4 ; order = 4
.end:
; Common epilogue: return the order and undo the prologue in reverse.
mov eax, ebp ; return order
add esp, byte 16
pop edi
pop esi
pop ebx
pop ebp
ret
end
|