ce188d4d |
1 | ; vim:filetype=nasm ts=8 |
2 | |
3 | ; libFLAC - Free Lossless Audio Codec library |
4 | ; Copyright (C) 2001-2009 Josh Coalson |
5 | ; Copyright (C) 2011-2016 Xiph.Org Foundation |
6 | ; |
7 | ; Redistribution and use in source and binary forms, with or without |
8 | ; modification, are permitted provided that the following conditions |
9 | ; are met: |
10 | ; |
11 | ; - Redistributions of source code must retain the above copyright |
12 | ; notice, this list of conditions and the following disclaimer. |
13 | ; |
14 | ; - Redistributions in binary form must reproduce the above copyright |
15 | ; notice, this list of conditions and the following disclaimer in the |
16 | ; documentation and/or other materials provided with the distribution. |
17 | ; |
18 | ; - Neither the name of the Xiph.org Foundation nor the names of its |
19 | ; contributors may be used to endorse or promote products derived from |
20 | ; this software without specific prior written permission. |
21 | ; |
22 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
23 | ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
24 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
25 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
26 | ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
27 | ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
28 | ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
29 | ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
30 | ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
31 | ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
32 | ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
33 | |
34 | %include "nasm.h" |
35 | |
; NOTE(review): data_section, cglobal and code_section are not defined in this
; file; presumably they are macros supplied by nasm.h (included above) that
; hide object-format differences -- confirm against nasm.h.
; Nothing is emitted between data_section and code_section here.
36 | data_section |
37 | |
; Names the routine that is defined with cident further down in this file;
; presumably this is what makes the symbol visible to the linker -- confirm.
38 | cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov |
39 | |
; The routine's instructions follow this directive.
40 | code_section |
41 | |
42 | ; ********************************************************************** |
43 | ; |
44 | ; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 *data, unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]) |
45 | ; { |
46 | ; FLAC__int32 last_error_0 = data[-1]; |
47 | ; FLAC__int32 last_error_1 = data[-1] - data[-2]; |
48 | ; FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]); |
49 | ; FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]); |
50 | ; FLAC__int32 error, save; |
51 | ; FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0; |
52 | ; unsigned i, order; |
53 | ; |
54 | ; for(i = 0; i < data_len; i++) { |
55 | ; error = data[i] ; total_error_0 += local_abs(error); save = error; |
56 | ; error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error; |
57 | ; error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error; |
58 | ; error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error; |
59 | ; error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save; |
60 | ; } |
61 | ; |
62 | ; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4)) |
63 | ; order = 0; |
64 | ; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4)) |
65 | ; order = 1; |
66 | ; else if(total_error_2 < min(total_error_3, total_error_4)) |
67 | ; order = 2; |
68 | ; else if(total_error_3 < total_error_4) |
69 | ; order = 3; |
70 | ; else |
71 | ; order = 4; |
72 | ; |
73 | ; residual_bits_per_sample[0] = (float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0); |
74 | ; residual_bits_per_sample[1] = (float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0); |
75 | ; residual_bits_per_sample[2] = (float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0); |
76 | ; residual_bits_per_sample[3] = (float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0); |
77 | ; residual_bits_per_sample[4] = (float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0); |
78 | ; |
79 | ; return order; |
80 | ; } |
; ----------------------------------------------------------------------
; unsigned FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov(
;         const FLAC__int32 *data,
;         unsigned data_len,
;         float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
;
; MMX + CMOV version of the C reference routine shown in the comment above.
;
; ABI:    32-bit, all arguments on the stack, result in eax.
; In:     data      - the routine reads data[-4] .. data[data_len - 1], so
;                     the four samples before data[0] must be readable
;                     (they are only touched when data_len != 0)
;         data_len  - number of samples; treated as unsigned
;         residual_bits_per_sample - receives five floats (orders 0..4)
; Out:    eax = chosen order (0..4).  When two totals tie, the higher
;               order wins; data_len == 0 returns 4 and stores five 0.0f.
; Saved:  ebp, ebx, esi, edi (pushed/popped here)
; Clobb:  eax, ecx, edx, mm0-mm7, flags; x87 registers are used for the
;         log2 computation and the x87 stack is left empty (emms is
;         executed before the first x87 instruction).
; Stack:  16 bytes of locals; only qword [esp] is used, as scratch for
;         handing unsigned integers to the FPU.
; Note:   the five totals are accumulated in 32 bits and wrap on overflow,
;         like the FLAC__uint32 totals of the C reference.
; ----------------------------------------------------------------------
	ALIGN 16
cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov

	push	ebp
	push	ebx
	push	esi
	push	edi
	sub	esp, byte 16

	; Stack layout from here on (return address + 4 saved registers +
	; 16 bytes of locals lie below the arguments):
	;   qword [esp] == scratch for loading FLAC__uint64s into FPU regs
	;   [esp + 36]  == data[]
	;   [esp + 40]  == data_len
	;   [esp + 44]  == residual_bits_per_sample[]

	; Register roles during the accumulation loop:
	;   ebx == &data[i]
	;   ecx == remaining sample count
	;   mm0 == total_error_1:total_error_0
	;   mm1 == total_error_2:total_error_3
	;   mm2 ==              :total_error_4
	;   mm3 == last_error_1:last_error_0
	;   mm4 == last_error_2:last_error_3
	; ebp holds the chosen order once the loop has finished.

	mov	ecx, [esp + 40]			; ecx = data_len
	test	ecx, ecx
	jz	near .data_len_is_0		; nothing to sum; also avoids reading data[-4..-1]

	; Seed last_error_0..3 from the four samples preceding data[0].
	mov	ebx, [esp + 36]			; ebx = data[]
	movd	mm3, [ebx - 4]			; mm3 = 0:last_error_0
	movd	mm2, [ebx - 8]			; mm2 = 0:data[-2]
	movd	mm1, [ebx - 12]			; mm1 = 0:data[-3]
	movd	mm0, [ebx - 16]			; mm0 = 0:data[-4]
	movq	mm5, mm3			; mm5 = 0:last_error_0
	psubd	mm5, mm2			; mm5 = 0:last_error_1
	punpckldq	mm3, mm5		; mm3 = last_error_1:last_error_0
	psubd	mm2, mm1			; mm2 = 0:data[-2] - data[-3]
	psubd	mm5, mm2			; mm5 = 0:last_error_2
	movq	mm4, mm5			; mm4 = 0:last_error_2
	psubd	mm4, mm2			; mm4 = 0:last_error_2 - (data[-2] - data[-3])
	paddd	mm4, mm1			; mm4 = 0:last_error_2 - (data[-2] - 2 * data[-3])
	psubd	mm4, mm0			; mm4 = 0:last_error_3
	punpckldq	mm4, mm5		; mm4 = last_error_2:last_error_3
	pxor	mm0, mm0			; mm0 = total_error_1:total_error_0 = 0
	pxor	mm1, mm1			; mm1 = total_error_2:total_error_3 = 0
	pxor	mm2, mm2			; mm2 = 0:total_error_4 = 0

	ALIGN 16
.loop:
	; One sample per iteration.  Each abs() below is the branch-free
	; (x ^ sign) - sign form, with sign = x >> 31 (arithmetic).
	movd	mm7, [ebx]			; mm7 = 0:error_0
	add	ebx, byte 4
	movq	mm6, mm7			; mm6 = 0:error_0
	psubd	mm7, mm3			; mm7 = :error_1
	punpckldq	mm6, mm7		; mm6 = error_1:error_0
	movq	mm5, mm6			; mm5 = error_1:error_0
	movq	mm7, mm6			; mm7 = error_1:error_0
	psubd	mm5, mm3			; mm5 = error_2:
	movq	mm3, mm6			; mm3 = error_1:error_0 (becomes last_error_1:last_error_0)
	psrad	mm6, 31
	pxor	mm7, mm6
	psubd	mm7, mm6			; mm7 = abs(error_1):abs(error_0)
	paddd	mm0, mm7			; mm0 = total_error_1:total_error_0
	movq	mm6, mm5			; mm6 = error_2:
	psubd	mm5, mm4			; mm5 = error_3:
	punpckhdq	mm5, mm6		; mm5 = error_2:error_3
	movq	mm7, mm5			; mm7 = error_2:error_3
	movq	mm6, mm5			; mm6 = error_2:error_3
	psubd	mm5, mm4			; mm5 = :error_4
	movq	mm4, mm6			; mm4 = error_2:error_3 (becomes last_error_2:last_error_3)
	psrad	mm6, 31
	pxor	mm7, mm6
	psubd	mm7, mm6			; mm7 = abs(error_2):abs(error_3)
	paddd	mm1, mm7			; mm1 = total_error_2:total_error_3
	movq	mm6, mm5			; mm6 = :error_4
	psrad	mm5, 31
	pxor	mm6, mm5
	psubd	mm6, mm5			; mm6 = :abs(error_4)
	paddd	mm2, mm6			; mm2 = :total_error_4

	dec	ecx
	jnz	short .loop

	; Pick the order with the smallest total; ties go to the higher order:
	;	if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
	;		order = 0;
	;	else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
	;		order = 1;
	;	else if(total_error_2 < min(total_error_3, total_error_4))
	;		order = 2;
	;	else if(total_error_3 < total_error_4)
	;		order = 3;
	;	else
	;		order = 4;
	movq	mm3, mm0			; mm3 = total_error_1:total_error_0
	movd	edi, mm2			; edi = total_error_4
	movd	esi, mm1			; esi = total_error_3
	movd	eax, mm0			; eax = total_error_0
	punpckhdq	mm1, mm1		; mm1 = total_error_2:total_error_2
	punpckhdq	mm3, mm3		; mm3 = total_error_1:total_error_1
	movd	edx, mm1			; edx = total_error_2
	movd	ecx, mm3			; ecx = total_error_1

	; eax tracks the running minimum, ebx the candidate order, and ebp
	; the best order so far.  cmovbe (not cmovb) makes a tie pick the
	; higher order, matching the strict '<' tests of the C code above.
	xor	ebx, ebx
	xor	ebp, ebp
	inc	ebx
	cmp	ecx, eax
	cmovb	eax, ecx			; eax = min(total_error_0, total_error_1)
	cmovbe	ebp, ebx
	inc	ebx
	cmp	edx, eax
	cmovb	eax, edx			; eax = min(total_error_0, total_error_1, total_error_2)
	cmovbe	ebp, ebx
	inc	ebx
	cmp	esi, eax
	cmovb	eax, esi			; eax = min(total_error_0, total_error_1, total_error_2, total_error_3)
	cmovbe	ebp, ebx
	inc	ebx
	cmp	edi, eax
	cmovb	eax, edi			; eax = min(total_error_0, total_error_1, total_error_2, total_error_3, total_error_4)
	cmovbe	ebp, ebx
	movd	ebx, mm0			; ebx = total_error_0
	emms					; leave MMX state before using the x87 FPU

	; For each order N (data_len is known to be > 0 here):
	;	residual_bits_per_sample[N] = (float)((total_error_N > 0) ? log(M_LN2 * (double)total_error_N / (double)data_len) / M_LN2 : 0.0);
	; i.e. log2(ln2 * total_error_N / data_len), computed as
	; 1.0 * log2(x) with fyl2x.
	;
	; fild only loads signed integers, so data_len is zero-extended to
	; 64 bits in the scratch qword first (the same technique used for
	; the totals below); this keeps data_len >= 2^31 positive.
	mov	eax, [esp + 40]			; eax = data_len
	mov	[esp], eax
	xor	eax, eax			; eax = 0 from here on: high dword of the uint64 temporaries and the bit pattern of 0.0f
	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)data_len
	fild	qword [esp]			; ST = data_len
.rbps_0:
	test	ebx, ebx
	jz	.total_error_0_is_0
	fld1					; ST = 1.0 data_len
	mov	[esp], ebx
	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_0
	mov	ebx, [esp + 44]			; ebx = residual_bits_per_sample[] (total_error_0 is no longer needed in ebx)
	fild	qword [esp]			; ST = total_error_0 1.0 data_len
	fdiv	st2				; ST = total_error_0/data_len 1.0 data_len
	fldln2					; ST = ln2 total_error_0/data_len 1.0 data_len
	fmulp	st1				; ST = ln2*total_error_0/data_len 1.0 data_len
	fyl2x					; ST = log2(ln2*total_error_0/data_len) data_len
	fstp	dword [ebx]			; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len)   ST = data_len
	jmp	short .rbps_1
.total_error_0_is_0:
	mov	ebx, [esp + 44]			; ebx = residual_bits_per_sample[]
	mov	[ebx], eax			; residual_bits_per_sample[0] = 0.0
.rbps_1:
	test	ecx, ecx
	jz	.total_error_1_is_0
	fld1					; ST = 1.0 data_len
	mov	[esp], ecx
	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_1
	fild	qword [esp]			; ST = total_error_1 1.0 data_len
	fdiv	st2				; ST = total_error_1/data_len 1.0 data_len
	fldln2					; ST = ln2 total_error_1/data_len 1.0 data_len
	fmulp	st1				; ST = ln2*total_error_1/data_len 1.0 data_len
	fyl2x					; ST = log2(ln2*total_error_1/data_len) data_len
	fstp	dword [ebx + 4]			; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len)   ST = data_len
	jmp	short .rbps_2
.total_error_1_is_0:
	mov	[ebx + 4], eax			; residual_bits_per_sample[1] = 0.0
.rbps_2:
	test	edx, edx
	jz	.total_error_2_is_0
	fld1					; ST = 1.0 data_len
	mov	[esp], edx
	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_2
	fild	qword [esp]			; ST = total_error_2 1.0 data_len
	fdiv	st2				; ST = total_error_2/data_len 1.0 data_len
	fldln2					; ST = ln2 total_error_2/data_len 1.0 data_len
	fmulp	st1				; ST = ln2*total_error_2/data_len 1.0 data_len
	fyl2x					; ST = log2(ln2*total_error_2/data_len) data_len
	fstp	dword [ebx + 8]			; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len)   ST = data_len
	jmp	short .rbps_3
.total_error_2_is_0:
	mov	[ebx + 8], eax			; residual_bits_per_sample[2] = 0.0
.rbps_3:
	test	esi, esi
	jz	.total_error_3_is_0
	fld1					; ST = 1.0 data_len
	mov	[esp], esi
	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_3
	fild	qword [esp]			; ST = total_error_3 1.0 data_len
	fdiv	st2				; ST = total_error_3/data_len 1.0 data_len
	fldln2					; ST = ln2 total_error_3/data_len 1.0 data_len
	fmulp	st1				; ST = ln2*total_error_3/data_len 1.0 data_len
	fyl2x					; ST = log2(ln2*total_error_3/data_len) data_len
	fstp	dword [ebx + 12]		; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len)   ST = data_len
	jmp	short .rbps_4
.total_error_3_is_0:
	mov	[ebx + 12], eax			; residual_bits_per_sample[3] = 0.0
.rbps_4:
	test	edi, edi
	jz	.total_error_4_is_0
	fld1					; ST = 1.0 data_len
	mov	[esp], edi
	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_4
	fild	qword [esp]			; ST = total_error_4 1.0 data_len
	fdiv	st2				; ST = total_error_4/data_len 1.0 data_len
	fldln2					; ST = ln2 total_error_4/data_len 1.0 data_len
	fmulp	st1				; ST = ln2*total_error_4/data_len 1.0 data_len
	fyl2x					; ST = log2(ln2*total_error_4/data_len) data_len
	fstp	dword [ebx + 16]		; residual_bits_per_sample[4] = log2(ln2*total_error_4/data_len)   ST = data_len
	jmp	short .rbps_end
.total_error_4_is_0:
	mov	[ebx + 16], eax			; residual_bits_per_sample[4] = 0.0
.rbps_end:
	fstp	st0				; pop data_len; ST = [empty]
	jmp	short .end
.data_len_is_0:
	; data_len == 0, so residual_bits_per_sample[*] = 0.0
	; (neither MMX nor the FPU has been touched on this path)
	xor	ebp, ebp
	mov	edi, [esp + 44]
	mov	[edi], ebp
	mov	[edi + 4], ebp
	mov	[edi + 8], ebp
	mov	[edi + 12], ebp
	mov	[edi + 16], ebp
	add	ebp, byte 4			; order = 4

.end:
	mov	eax, ebp			; return order
	add	esp, byte 16
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
308 | |
309 | ; end |