1 ; vim:filetype=nasm ts=8
3 ; libFLAC - Free Lossless Audio Codec library
4 ; Copyright (C) 2001-2009 Josh Coalson
5 ; Copyright (C) 2011-2016 Xiph.Org Foundation
7 ; Redistribution and use in source and binary forms, with or without
8 ; modification, are permitted provided that the following conditions
11 ; - Redistributions of source code must retain the above copyright
12 ; notice, this list of conditions and the following disclaimer.
14 ; - Redistributions in binary form must reproduce the above copyright
15 ; notice, this list of conditions and the following disclaimer in the
16 ; documentation and/or other materials provided with the distribution.
18 ; - Neither the name of the Xiph.org Foundation nor the names of its
19 ; contributors may be used to endorse or promote products derived from
20 ; this software without specific prior written permission.
22 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
26 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
27 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
28 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
; Entry points implemented in this file, each declared through the cglobal
; macro.
; NOTE(review): cglobal is defined outside this excerpt; presumably it makes
; the symbol visible to the linker with the platform's name decoration --
; confirm against the include file that defines it.
38 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
39 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
40 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
41 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
42 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
43 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
44 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
45 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
46 cglobal FLAC__lpc_restore_signal_asm_ia32
47 cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
48 cglobal FLAC__lpc_restore_signal_wide_asm_ia32
52 ; **********************************************************************
54 ; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
57 ; unsigned sample, coeff;
58 ; const unsigned limit = data_len - lag;
60 ; FLAC__ASSERT(lag > 0);
61 ; FLAC__ASSERT(lag <= data_len);
63 ; for(coeff = 0; coeff < lag; coeff++)
65 ; for(sample = 0; sample <= limit; sample++) {
67 ; for(coeff = 0; coeff < lag; coeff++)
68 ; autoc[coeff] += d * data[sample+coeff];
70 ; for(; sample < data_len; sample++) {
72 ; for(coeff = 0; coeff < data_len - sample; coeff++)
73 ; autoc[coeff] += d * data[sample+coeff];
; FLAC__lpc_compute_autocorrelation_asm_ia32
; x87 implementation of the two autocorrelation loops quoted in the C-style
; comments inside this routine.
; Stack arguments, at the offsets the visible code uses (the prologue that
; establishes these offsets is not visible in this excerpt):
;   [esp + 16] data[]   [esp + 20] data_len   [esp + 24] lag   [esp + 28] autoc[]
; Both loops enter a fully unrolled fmul/fadd/fstp chain at an address computed
; into edx, so the encoded byte length of every unrolled step is part of the
; algorithm. Do not change any operand form in the chains without re-deriving
; the edx adjustments.
78 cident FLAC__lpc_compute_autocorrelation_asm_ia32
79 ;[esp + 28] == autoc[]
81 ;[esp + 20] == data_len
86 ;ASSERT(lag <= data_len)
93 ; for(coeff = 0; coeff < lag; coeff++)
95 mov edi, [esp + 28] ; edi == autoc
96 mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write
100 ; const unsigned limit = data_len - lag;
101 mov eax, [esp + 24] ; eax == lag
103 sub ecx, eax ; ecx == limit
105 mov edi, [esp + 28] ; edi == autoc
106 mov esi, [esp + 16] ; esi == data
107 inc ecx ; we are looping <= limit so we add one to the counter
109 ; for(sample = 0; sample <= limit; sample++) {
111 ; for(coeff = 0; coeff < lag; coeff++)
112 ; autoc[coeff] += d * data[sample+coeff];
114 fld dword [esi] ; ST = d <- data[sample]
115 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
116 lea edx, [eax + eax*2] ; edx = 3*eax
118 lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1] ; edx = eax + 4*edx + distance from .get_eip1 to .jumper1_0
; The three single-byte increments below add 3: the final step of the chain
; uses [esi]/[edi] with no displacement byte, so it is three bytes shorter
; than a regular step.
122 inc edx ; compensate for the shorter opcode on the last iteration
123 inc edx ; compensate for the shorter opcode on the last iteration
124 inc edx ; compensate for the shorter opcode on the last iteration
; 32*4 == 128 does not fit a signed 8-bit displacement, so each of the three
; instructions of the first step is encoded with a 32-bit displacement
; (3 extra bytes each, 9 in total).
127 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration
; Unrolled chain, highest coefficient first. The stack comments ("product"
; above "d") imply d is duplicated before each fmul; that instruction is not
; visible in this excerpt.
136 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here!
137 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here!
138 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here!
140 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
141 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
142 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
144 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
145 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
146 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
148 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
149 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
150 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
152 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
153 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
154 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
156 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
157 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
158 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
160 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
161 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
162 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
164 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
165 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
166 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
168 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
169 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
170 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
172 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
173 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
174 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
176 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
177 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
178 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
180 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
181 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
182 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
184 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
185 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
186 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
188 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
189 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
190 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
192 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
193 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
194 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
196 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
197 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
198 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
200 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
201 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
202 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
204 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
205 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
206 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
208 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
209 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
210 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
212 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
213 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
214 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
216 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
217 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
218 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
220 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
221 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
222 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
224 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
225 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
226 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
228 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
229 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
230 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
232 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
233 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
234 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
236 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
237 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
238 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
240 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
241 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
242 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
244 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+5] d
245 fadd dword [edi + ( 5*4)] ; ST = autoc[5]+d*data[sample+5] d
246 fstp dword [edi + ( 5*4)] ; autoc[5]+=d*data[sample+5] ST = d
248 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
249 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
250 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
252 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
253 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
254 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
256 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
257 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
258 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
260 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
261 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
262 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
264 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
265 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
266 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
269 fstp st0 ; pop d, ST = empty
270 add esi, byte 4 ; sample++
273 fld dword [esi] ; ST = d <- data[sample]
; Tail loop: the number of coefficients shrinks by one per sample, so the
; entry point into the second unrolled chain moves one step later each pass.
277 ; for(; sample < data_len; sample++) {
279 ; for(coeff = 0; coeff < data_len - sample; coeff++)
280 ; autoc[coeff] += d * data[sample+coeff];
282 mov ecx, [esp + 24] ; ecx <- lag
283 dec ecx ; ecx <- lag - 1
284 jz near .end ; skip loop if 0 (i.e. lag == 1)
286 fld dword [esi] ; ST = d <- data[sample]
287 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through
288 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
289 lea edx, [eax + eax*2] ; edx = 3*eax
291 lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2] ; edx = eax + 4*edx + distance from .get_eip2 to .jumper2_0
; +3 for the displacement-free final step, as in the first loop. This chain
; starts at coefficient 31, so every other step uses an 8-bit displacement and
; no long-encoding correction is applied here.
295 inc edx ; compensate for the shorter opcode on the last iteration
296 inc edx ; compensate for the shorter opcode on the last iteration
297 inc edx ; compensate for the shorter opcode on the last iteration
301 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
302 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
303 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
305 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
306 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
307 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
309 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
310 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
311 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
313 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
314 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
315 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
317 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
318 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
319 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
321 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
322 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
323 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
325 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
326 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
327 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
329 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
330 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
331 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
333 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
334 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
335 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
337 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
338 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
339 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
341 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
342 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
343 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
345 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
346 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
347 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
349 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
350 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
351 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
353 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
354 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
355 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
357 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
358 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
359 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
361 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
362 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
363 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
365 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
366 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
367 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
369 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
370 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
371 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
373 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
374 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
375 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
377 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
378 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
379 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
381 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
382 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
383 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
385 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
386 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
387 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
389 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
390 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
391 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
393 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
394 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
395 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
397 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
398 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
399 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
401 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
402 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
403 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
405 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+5] d
406 fadd dword [edi + ( 5*4)] ; ST = autoc[5]+d*data[sample+5] d
407 fstp dword [edi + ( 5*4)] ; autoc[5]+=d*data[sample+5] ST = d
409 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
410 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
411 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
413 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
414 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
415 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
417 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
418 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
419 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
421 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
422 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
423 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
425 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
426 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
427 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
430 fstp st0 ; pop d, ST = empty
431 add esi, byte 4 ; sample++
434 add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target (11 bytes == one unrolled step, i.e. one coefficient fewer next pass)
435 fld dword [esi] ; ST = d <- data[sample]
; FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
; SSE autocorrelation keeping four lags in a single XMM register.
; Arguments are read directly off the stack:
;   [esp + 4] data[]   [esp + 8] data_len   [esp + 16] autoc[]
; Register roles visible in this excerpt:
;   eax  - pointer to the current sample
;   edx  - data_len while iterating, reloaded with autoc at the end
;   xmm0 - current sample broadcast to all four lanes
;   xmm2 - window of the four most recent samples
;   xmm5 - running sums (its initialisation and final store are not visible
;          in this excerpt)
446 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
447 ;[esp + 16] == autoc[]
449 ;[esp + 8] == data_len
454 ;ASSERT(lag <= data_len)
456 ; for(coeff = 0; coeff < lag; coeff++)
457 ; autoc[coeff] = 0.0;
460 mov edx, [esp + 8] ; edx == data_len
461 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
463 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
465 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
466 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
467 .warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
468 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
469 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
474 ; start by reading the next sample
475 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
477 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
478 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
480 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
481 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
486 mov edx, [esp + 16] ; edx == autoc
; FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
; SSE autocorrelation keeping eight lags in a pair of XMM registers.
; Arguments are read directly off the stack:
;   [esp + 4] data[]   [esp + 8] data_len   [esp + 16] autoc[]
; Register roles visible in this excerpt:
;   eax       - pointer to the current sample
;   edx       - data_len while iterating, reloaded with autoc at the end
;   xmm1:xmm0 - current sample broadcast to all eight lanes
;   xmm3:xmm2 - window of the eight most recent samples
;   xmm6:xmm5 - running sums (their initialisation is not visible here)
493 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
494 ;[esp + 16] == autoc[]
496 ;[esp + 8] == data_len
501 ;ASSERT(lag <= data_len)
503 ; for(coeff = 0; coeff < lag; coeff++)
504 ; autoc[coeff] = 0.0;
508 mov edx, [esp + 8] ; edx == data_len
509 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
511 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
513 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
514 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
515 movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
516 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
517 .warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
519 mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
521 addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
526 ; start by reading the next sample
527 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
528 ; here we reorder the instructions; see the (#) indexes for a logical order
529 shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
531 shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
532 shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
533 movss xmm3, xmm2 ; (5) carry the float rotated out of xmm2 into the low lane of xmm3
534 movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
535 movss xmm2, xmm0 ; (6) insert the new sample into the low lane of xmm2
536 mulps xmm1, xmm3 ; (8)
537 mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
538 addps xmm6, xmm1 ; (10)
539 addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
544 mov edx, [esp + 16] ; edx == autoc
546 movups [edx + 16], xmm6 ; unaligned store of the upper four sums at byte offset 16 of autoc
; FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
; SSE autocorrelation keeping twelve lags in three XMM registers.
; Arguments are read directly off the stack:
;   [esp + 4] data[]   [esp + 8] data_len   [esp + 16] autoc[]
; Register roles visible in this excerpt:
;   eax            - pointer to the current sample
;   edx            - data_len while iterating, reloaded with autoc at the end
;   xmm0           - current sample broadcast to all four lanes
;   xmm4:xmm3:xmm2 - window of the twelve most recent samples
;   xmm7:xmm6:xmm5 - running sums (their initialisation is not visible here)
552 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
553 ;[esp + 16] == autoc[]
555 ;[esp + 8] == data_len
560 ;ASSERT(lag <= data_len)
562 ; for(coeff = 0; coeff < lag; coeff++)
563 ; autoc[coeff] = 0.0;
568 mov edx, [esp + 8] ; edx == data_len
569 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
571 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
573 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
574 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
575 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
576 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0
577 .warmup: ; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
585 addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
590 ; start by reading the next sample
591 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
593 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
595 ; shift xmm4:xmm3:xmm2 left by one float
596 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
597 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
598 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
603 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
617 mov edx, [esp + 16] ; edx == autoc
619 movups [edx + 16], xmm6 ; unaligned store of the middle four sums at byte offset 16 of autoc
620 movups [edx + 32], xmm7 ; unaligned store of the upper four sums at byte offset 32 of autoc
; FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
; SSE autocorrelation keeping sixteen lags. Unlike the smaller variants this
; one addresses its arguments through ebp, because esp is re-aligned to 16
; bytes so that aligned SSE accesses to a stack slot are legal:
;   [ebp + 8] data[]   [ebp + 12] data_len   [ebp + 20] autoc[]
;   [esp + 16] 16-byte scratch slot holding the fourth group of running sums
; Register roles visible in this excerpt:
;   eax                 - pointer to the current sample
;   edx                 - data_len while iterating, reloaded with autoc at the end
;   xmm0                - current sample broadcast to all four lanes
;   xmm4:xmm3:xmm2:xmm1 - window of the sixteen most recent samples
; The frame set-up and tear-down around ebp/esp are not visible in this
; excerpt.
626 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
627 ;[ebp + 20] == autoc[]
629 ;[ebp + 12] == data_len
632 ;[esp + 16] == __m128
636 and esp, -16 ; stack realign for SSE instructions 'movaps' and 'addps'
641 ;ASSERT(lag <= data_len)
642 ;ASSERT(data_len > 0)
644 ; for(coeff = 0; coeff < lag; coeff++)
645 ; autoc[coeff] = 0.0;
649 movaps [esp + 16], xmm6 ; initialise the in-memory accumulator from xmm6 (presumably zeroed just before; not visible here)
651 mov edx, [ebp + 12] ; edx == data_len
652 mov eax, [ebp + 8] ; eax == &data[sample] <- &data[0]
654 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
656 movaps xmm1, xmm0 ; xmm1 = 0,0,0,data[0]
657 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
658 xorps xmm2, xmm2 ; xmm2 = 0,0,0,0
659 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
660 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0
668 ; start by reading the next sample
669 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
671 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
673 ; shift xmm4:xmm3:xmm2:xmm1 left by one float
674 shufps xmm1, xmm1, 93h ; 93h=2-1-0-3 => xmm1 gets rotated left by one float
675 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
676 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
677 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
683 ; xmmB:xmmA:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1
694 addps xmm0, [esp + 16] ; xmm0 += in-memory accumulator (aligned load, hence the esp realignment)
696 movaps [esp + 16], xmm0 ; write the updated sums back to the stack slot
702 mov edx, [ebp + 20] ; edx == autoc
704 movups [edx + 16], xmm6 ; unaligned store at byte offset 16 of autoc
706 movaps xmm6, [esp + 16] ; fetch the in-memory accumulator so it can be stored
707 movups [edx + 32], xmm5 ; unaligned store at byte offset 32 of autoc
708 movups [edx + 48], xmm6 ; unaligned store of the stack-held sums at byte offset 48 of autoc
714 ;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
716 ; for(i = 0; i < data_len; i++) {
718 ; for(j = 0; j < order; j++)
719 ; sum += qlp_coeff[j] * data[i-j-1];
720 ; residual[i] = data[i] - (sum >> lp_quantization);
; FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
; Scalar IA-32 implementation of the residual loop given in the C prototype
; comment that precedes this routine.
; Stack arguments, at the offsets the visible code uses (the prologue that
; establishes these offsets is not visible in this excerpt):
;   [esp + 20] data[]        [esp + 24] data_len   [esp + 28] qlp_coeff[]
;   [esp + 32] order         [esp + 36] lp_quantization
;   [esp + 40] residual[]
; For order <= 32 the routine enters an unrolled multiply-accumulate chain at
; a computed address, so instruction byte lengths in that chain must not
; change.
724 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
725 ;[esp + 40] residual[]
726 ;[esp + 36] lp_quantization
728 ;[esp + 28] qlp_coeff[]
739 mov esi, [esp + 20] ; esi = data[]
740 mov edi, [esp + 40] ; edi = residual[]
741 mov eax, [esp + 32] ; eax = order
742 mov ebx, [esp + 24] ; ebx = data_len
745 jz near .end ; do nothing if data_len == 0
; NOTE(review): the three loads below look like the set-up of a dedicated
; single-coefficient path (coefficient, previous sample, shift count); the
; branch that selects it is not visible in this excerpt -- confirm.
751 mov edx, [ecx] ; edx = qlp_coeff[0]
752 mov eax, [esi - 4] ; eax = data[-1]
753 mov ecx, [esp + 36] ; cl = lp_quantization
770 cmp eax, byte 32 ; for order <= 32 there is a faster routine
773 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
786 imul eax, [esi + 4 * ecx] ; eax *= dword at esi + 4*ecx (one product term of the generic loop)
789 jnz short .i_32more_loop_j
; Computed entry into the unrolled chain: eax + eax*8 scales by 9, presumably
; the byte length of one unrolled step -- confirm against the full chain.
811 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] ; edx = 9*eax + distance from .get_eip0 to .jumper_0
816 mov eax, [esp + 28] ; eax = qlp_coeff[]
; Multiply steps of the unrolled chain (the matching coefficient loads and
; accumulations are not visible in this excerpt). Displacements step by 4
; bytes from -128, i.e. from 32 dwords before esi towards esi.
821 imul ecx, [esi - 128] ; ecx *= dword 32 elements before esi
824 imul ecx, [esi - 124] ; ecx *= dword 31 elements before esi
827 imul ecx, [esi - 120] ; ecx *= dword 30 elements before esi
830 imul ecx, [esi - 116] ; ecx *= dword 29 elements before esi
833 imul ecx, [esi - 112] ; ecx *= dword 28 elements before esi
836 imul ecx, [esi - 108] ; ecx *= dword 27 elements before esi
839 imul ecx, [esi - 104] ; ecx *= dword 26 elements before esi
842 imul ecx, [esi - 100] ; ecx *= dword 25 elements before esi
913 mov ecx, [eax] ; ecx = qlp_coeff[0]; no displacement byte, so this instruction is one byte shorter than the other coefficient loads
937 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
938 ; the channel and qlp_coeffs must be <= 16. Especially note that this routine
939 ; cannot be used for side-channel coded 16bps channels since the effective bps
; FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
; MMX variant of the residual computation; see the WATCHOUT note preceding
; this routine for its 16-bit data restriction.
; Stack arguments, at the offsets the visible code uses (the prologue that
; establishes these offsets is not visible in this excerpt):
;   [esp + 20] data[]        [esp + 24] data_len   [esp + 28] qlp_coeff[]
;   [esp + 32] order         [esp + 36] lp_quantization
;   [esp + 40] residual[]
; The coefficients are first copied to the stack as 16-bit words, then read
; back as packed quadwords.
942 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
943 ;[esp + 40] residual[]
944 ;[esp + 36] lp_quantization
946 ;[esp + 28] qlp_coeff[]
957 mov esi, [esp + 20] ; esi = data[]
958 mov edi, [esp + 40] ; edi = residual[]
959 mov eax, [esp + 32] ; eax = order
960 mov ebx, [esp + 24] ; ebx = data_len
963 jz near .end ; do nothing if data_len == 0
968 mov edx, [esp + 28] ; edx = qlp_coeff[]
969 movd mm6, [esp + 36] ; mm6 = 0:lp_quantization
976 push word [edx + 4 * ecx] ; push the low 16 bits of qlp_coeff[ecx] (16-bit push: esp moves by 2)
979 jnz short .copy_qlp_loop
992 movq mm5, [esp + 2 * eax - 8] ; load four packed 16-bit words from the stack copy of the coefficients
994 punpckldq mm4, [esi - 12] ; high dword of mm4 <- dword at esi - 12
996 punpckldq mm0, [esi - 4] ; high dword of mm0 <- dword at esi - 4
1000 jnbe short .mmx_4more ; unsigned "above" branch; the compare that sets the flags is not visible here
1006 punpckldq mm1, [esi + 4] ; high dword of mm1 <- dword at esi + 4
1044 punpckldq mm1, [esi + 4] ; high dword of mm1 <- dword at esi + 4
1065 movd mm0, [ecx - 16] ; low dword of mm0 <- dword at ecx - 16
1067 punpckldq mm0, [ecx - 12] ; high dword of mm0 <- dword at ecx - 12
1068 punpckldq mm7, [ecx - 4] ; high dword of mm7 <- dword at ecx - 4
1073 movd mm0, [ecx - 12] ; low dword of mm0 <- dword at ecx - 12
1074 punpckldq mm0, [ecx - 8] ; high dword of mm0 <- dword at ecx - 8
1075 punpckldq mm7, [ecx] ; high dword of mm7 <- dword at ecx
1083 jnz .mmx_4more_loop_j
1099 jg near .mmx_4more_loop_i
; Hands over to the scalar routine's .begin label; the instruction that sets
; the flags for this branch is not visible in this excerpt.
1107 jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
1116 ; **********************************************************************
1118 ; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1123 ; FLAC__ASSERT(order > 0);
1125 ; for(i = 0; i < data_len; i++) {
1127 ; for(j = 0; j < order; j++)
1128 ; sum += qlp_coeff[j] * data[i-j-1];
1129 ; data[i] = residual[i] + (sum >> lp_quantization);
; FLAC__lpc_restore_signal_asm_ia32
; Scalar IA-32 implementation of the signal-restoration loop given in the C
; prototype comment that precedes this routine:
;   data[i] = residual[i] + (sum >> lp_quantization)
; Stack arguments, at the offsets the visible code uses (the prologue that
; establishes these offsets is not visible in this excerpt):
;   [esp + 20] residual[]    [esp + 24] data_len   [esp + 28] qlp_coeff[]
;   [esp + 32] order         [esp + 36] lp_quantization
;   [esp + 40] data[]
; Register roles in the unrolled chain: eax = qlp_coeff, edi = &data[i],
; ecx = scratch product, ebp = sum.
; For order <= 32 the chain is entered at a computed address, so the byte
; length of every step in it must not change.
1133 cident FLAC__lpc_restore_signal_asm_ia32
1135 ;[esp + 36] lp_quantization
1137 ;[esp + 28] qlp_coeff[]
1138 ;[esp + 24] data_len
1139 ;[esp + 20] residual[]
1148 mov esi, [esp + 20] ; esi = residual[]
1149 mov edi, [esp + 40] ; edi = data[]
1150 mov eax, [esp + 32] ; eax = order
1151 mov ebx, [esp + 24] ; ebx = data_len
1154 jz near .end ; do nothing if data_len == 0
1178 cmp eax, byte 32 ; for order <= 32 there is a faster routine
1181 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
1194 imul eax, [edi + 4 * ecx] ; eax *= dword at edi + 4*ecx (one product term of the generic loop)
1197 jnz short .x87_32more_loop_j
1207 jnz .x87_32more_loop_i
; Computed entry into the unrolled chain: eax + eax*8 scales by 9, which
; matches the three-instruction mov/imul/add step below when both memory
; operands use an 8-bit displacement.
1218 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] ; edx = 9*eax + distance from .get_eip0 to .jumper_0
1219 call .mov_eip_to_eax ; helper label not visible here; per its name it returns the current EIP in eax
1222 inc edx ; compensate for the shorter opcode on the last iteration
1223 mov eax, [esp + 28] ; eax = qlp_coeff[]
1227 mov ecx, [eax + 124] ; ecx = qlp_coeff[31]
1228 imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32]
1229 add ebp, ecx ; sum += qlp_coeff[31] * data[i-32]
1230 mov ecx, [eax + 120] ; ecx = qlp_coeff[30]
1231 imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31]
1232 add ebp, ecx ; sum += qlp_coeff[30] * data[i-31]
1233 mov ecx, [eax + 116] ; ecx = qlp_coeff[29]
1234 imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30]
1235 add ebp, ecx ; sum += qlp_coeff[29] * data[i-30]
1236 mov ecx, [eax + 112] ; ecx = qlp_coeff[28]
1237 imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29]
1238 add ebp, ecx ; sum += qlp_coeff[28] * data[i-29]
1239 mov ecx, [eax + 108] ; ecx = qlp_coeff[27]
1240 imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28]
1241 add ebp, ecx ; sum += qlp_coeff[27] * data[i-28]
1242 mov ecx, [eax + 104] ; ecx = qlp_coeff[26]
1243 imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27]
1244 add ebp, ecx ; sum += qlp_coeff[26] * data[i-27]
1245 mov ecx, [eax + 100] ; ecx = qlp_coeff[25]
1246 imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26]
1247 add ebp, ecx ; sum += qlp_coeff[25] * data[i-26]
1248 mov ecx, [eax + 96] ; ecx = qlp_coeff[24]
1249 imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25]
1250 add ebp, ecx ; sum += qlp_coeff[24] * data[i-25]
1251 mov ecx, [eax + 92] ; ecx = qlp_coeff[23]
1252 imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24]
1253 add ebp, ecx ; sum += qlp_coeff[23] * data[i-24]
1254 mov ecx, [eax + 88] ; ecx = qlp_coeff[22]
1255 imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23]
1256 add ebp, ecx ; sum += qlp_coeff[22] * data[i-23]
1257 mov ecx, [eax + 84] ; ecx = qlp_coeff[21]
1258 imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22]
1259 add ebp, ecx ; sum += qlp_coeff[21] * data[i-22]
1260 mov ecx, [eax + 80] ; ecx = qlp_coeff[20]
1261 imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21]
1262 add ebp, ecx ; sum += qlp_coeff[20] * data[i-21]
1263 mov ecx, [eax + 76] ; ecx = qlp_coeff[19]
1264 imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20]
1265 add ebp, ecx ; sum += qlp_coeff[19] * data[i-20]
1266 mov ecx, [eax + 72] ; ecx = qlp_coeff[18]
1267 imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19]
1268 add ebp, ecx ; sum += qlp_coeff[18] * data[i-19]
1269 mov ecx, [eax + 68] ; ecx = qlp_coeff[17]
1270 imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18]
1271 add ebp, ecx ; sum += qlp_coeff[17] * data[i-18]
1272 mov ecx, [eax + 64] ; ecx = qlp_coeff[16]
1273 imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17]
1274 add ebp, ecx ; sum += qlp_coeff[16] * data[i-17]
1275 mov ecx, [eax + 60] ; ecx = qlp_coeff[15]
1276 imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16]
1277 add ebp, ecx ; sum += qlp_coeff[15] * data[i-16]
1278 mov ecx, [eax + 56] ; ecx = qlp_coeff[14]
1279 imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15]
1280 add ebp, ecx ; sum += qlp_coeff[14] * data[i-15]
1281 mov ecx, [eax + 52] ; ecx = qlp_coeff[13]
1282 imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14]
1283 add ebp, ecx ; sum += qlp_coeff[13] * data[i-14]
1284 mov ecx, [eax + 48] ; ecx = qlp_coeff[12]
1285 imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13]
1286 add ebp, ecx ; sum += qlp_coeff[12] * data[i-13]
1287 mov ecx, [eax + 44] ; ecx = qlp_coeff[11]
1288 imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12]
1289 add ebp, ecx ; sum += qlp_coeff[11] * data[i-12]
1290 mov ecx, [eax + 40] ; ecx = qlp_coeff[10]
1291 imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11]
1292 add ebp, ecx ; sum += qlp_coeff[10] * data[i-11]
1293 mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9]
1294 imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10]
1295 add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10]
1296 mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8]
1297 imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9]
1298 add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9]
1299 mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7]
1300 imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8]
1301 add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8]
1302 mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6]
1303 imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7]
1304 add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7]
1305 mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5]
1306 imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6]
1307 add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6]
1308 mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4]
1309 imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5]
1310 add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5]
1311 mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3]
1312 imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4]
1313 add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4]
1314 mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2]
1315 imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3]
1316 add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3]
1317 mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1]
1318 imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2]
1319 add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2]
1320 mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1321 imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1]
1322 add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1]
1326 sar ebp, cl ; ebp = (sum >> lp_quantization); arithmetic shift, cl is expected to hold lp_quantization (its load is not visible here)
; [esi + edi] addresses residual[i] only if esi holds the residual pointer
; relative to edi at this point; the instruction making that adjustment is
; not visible in this excerpt.
1327 add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization)
1328 mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization)
1343 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
1344 ; the channel and qlp_coeffs must be <= 16. Especially note that this routine
1345 ; cannot be used for side-channel coded 16bps channels since the effective bps
1347 ; WATCHOUT: this routine requires that each data array have a buffer of up to
1348 ; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
1349 ; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
; MMX variant of the LPC signal-restore routine. The WATCHOUT notes that
; precede this entry point (16-bit data limit, zero padding in front of each
; data array) apply to this code path.
1351 cident FLAC__lpc_restore_signal_asm_ia32_mmx
; Stack arguments as addressed by the code below:
1353 ;[esp + 36] lp_quantization
1355 ;[esp + 28] qlp_coeff[]
1356 ;[esp + 24] data_len
1357 ;[esp + 20] residual[]
1372 jz near .end ; do nothing if data_len == 0
1374 jb near FLAC__lpc_restore_signal_asm_ia32.begin ; unsigned "below": continue in the plain ia32 routine at its .begin label
1377 movd mm6, [esp + 36] ; mm6 = lp_quantization (zero-extended to 64 bits)
1384 push word [edx + 4 * ecx] ; push one 16-bit word read at dword stride from [edx] (presumably qlp_coeff[ecx] narrowed to 16 bits -- TODO confirm)
1387 jnz short .copy_qlp_loop ; keep copying while the zero flag is clear
1400 movq mm5, [esp + 2 * eax - 8] ; mm5 = 8 bytes from the stack, addressed in 2-byte units by eax
1401 movd mm4, [edi - 16] ; mm4 low dword = [edi - 16], high dword cleared
1402 punpckldq mm4, [edi - 12] ; mm4 high dword = [edi - 12]
1404 punpckldq mm0, [edi - 4] ; mm0 high dword = [edi - 4]
1408 jnbe short .mmx_4more ; unsigned "above": take the .mmx_4more path
1446 movd mm0, [ecx - 16] ; mm0 low dword = [ecx - 16], high dword cleared
1447 punpckldq mm0, [ecx - 12] ; mm0 high dword = [ecx - 12]
1449 punpckldq mm1, [ecx - 4] ; mm1 high dword = [ecx - 4]
1457 jnz .mmx_4more_loop_j ; next pass of the j loop while the zero flag is clear
1474 jnz short .mmx_4more_loop_i ; next pass of the i loop while the zero flag is clear
1487 ; **********************************************************************
1489 ;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
1494 ; FLAC__ASSERT(order > 0);
1496 ; for(i = 0; i < data_len; i++) {
1498 ; for(j = 0; j < order; j++)
1499 ; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
1500 ; residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
; "Wide" residual computation: each qlp_coeff[j] * data[i-j-1] product is formed
; as a 64-bit value in edx:eax (one-operand imul) before the shift by
; lp_quantization. The C reference loop is in the comment block preceding this
; entry point.
1504 cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
; Stack arguments as addressed by the code below:
1505 ;[esp + 40] residual[]
1506 ;[esp + 36] lp_quantization
1508 ;[esp + 28] qlp_coeff[]
1509 ;[esp + 24] data_len
1513 ;ASSERT(order <= 32)
1514 ;ASSERT(lp_quantization <= 31)
1521 mov ebx, [esp + 24] ; ebx = data_len
1523 jz near .end ; do nothing if data_len == 0
1526 mov eax, [esp + 32] ; eax = order
1530 mov esi, [esp + 40] ; esi = residual[]
1531 mov edi, [esp + 20] ; edi = data[]
1532 mov ecx, [esp + 28] ; ecx = qlp_coeff[]
1533 mov ebp, [ecx] ; ebp = qlp_coeff[0]
1534 mov eax, [edi - 4] ; eax = data[-1]
1535 mov ecx, [esp + 36] ; cl = lp_quantization
1538 imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
1539 shrd eax, edx, cl ; eax = low 32 bits of (edx:eax >> cl). Original note: 0 <= lp_quantization <= 15 (NOTE(review): the ASSERT above permits up to 31 -- confirm which bound is intended)
1554 .i_32: ; eax = order
1557 lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] ; ebp = 5 * eax + (.jumper_0 - .get_eip0)
1558 call .mov_eip_to_eax ; NOTE(review): by its name this helper returns the current EIP in eax -- confirm
1561 inc ebp ; compensate for the shorter opcode on the last iteration
1563 mov ebx, [esp + 28] ; ebx = qlp_coeff[]
1564 mov edi, [esp + 20] ; edi = data[]
1565 sub [esp + 40], edi ; residual[] -= data[]
; Unrolled multiply steps, highest tap first: [ebx + 4*k] is qlp_coeff[k] and
; [edi - 4*(k+1)] is data[i-(k+1)]; edx (the high half of each product) is
; folded into esi with carry.
1580 mov eax, [ebx + 124] ; eax = qlp_coeff[31]
1581 imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32]
1583 adc esi, edx ; sum += qlp_coeff[31] * data[i-32]
1585 mov eax, [ebx + 120] ; eax = qlp_coeff[30]
1586 imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31]
1588 adc esi, edx ; sum += qlp_coeff[30] * data[i-31]
1590 mov eax, [ebx + 116] ; eax = qlp_coeff[29]
1591 imul dword [edi - 120] ; edx:eax = qlp_coeff[29] * data[i-30]
1595 mov eax, [ebx + 112] ; eax = qlp_coeff[28]
1596 imul dword [edi - 116] ; edx:eax = qlp_coeff[28] * data[i-29]
1600 mov eax, [ebx + 108] ; eax = qlp_coeff[27]
1601 imul dword [edi - 112] ; edx:eax = qlp_coeff[27] * data[i-28]
1605 mov eax, [ebx + 104] ; eax = qlp_coeff[26]
1606 imul dword [edi - 108] ; edx:eax = qlp_coeff[26] * data[i-27]
1610 mov eax, [ebx + 100] ; eax = qlp_coeff[25]
1611 imul dword [edi - 104] ; edx:eax = qlp_coeff[25] * data[i-26]
1616 imul dword [edi - 100] ; edx:eax = eax * data[i-25]
1621 imul dword [edi - 96] ; edx:eax = eax * data[i-24]
1626 imul dword [edi - 92] ; edx:eax = eax * data[i-23]
1631 imul dword [edi - 88] ; edx:eax = eax * data[i-22]
1636 imul dword [edi - 84] ; edx:eax = eax * data[i-21]
1641 imul dword [edi - 80] ; edx:eax = eax * data[i-20]
1646 imul dword [edi - 76] ; edx:eax = eax * data[i-19]
1651 imul dword [edi - 72] ; edx:eax = eax * data[i-18]
1656 imul dword [edi - 68] ; edx:eax = eax * data[i-17]
1661 imul dword [edi - 64] ; edx:eax = eax * data[i-16]
1666 imul dword [edi - 60] ; edx:eax = eax * data[i-15]
1671 imul dword [edi - 56] ; edx:eax = eax * data[i-14]
1676 imul dword [edi - 52] ; edx:eax = eax * data[i-13]
1681 imul dword [edi - 48] ; edx:eax = eax * data[i-12]
1686 imul dword [edi - 44] ; edx:eax = eax * data[i-11]
1691 imul dword [edi - 40] ; edx:eax = eax * data[i-10]
1696 imul dword [edi - 36] ; edx:eax = eax * data[i- 9]
1701 imul dword [edi - 32] ; edx:eax = eax * data[i- 8]
1706 imul dword [edi - 28] ; edx:eax = eax * data[i- 7]
1711 imul dword [edi - 24] ; edx:eax = eax * data[i- 6]
1716 imul dword [edi - 20] ; edx:eax = eax * data[i- 5]
1721 imul dword [edi - 16] ; edx:eax = eax * data[i- 4]
1726 imul dword [edi - 12] ; edx:eax = eax * data[i- 3]
1731 imul dword [edi - 8] ; edx:eax = eax * data[i- 2]
1735 mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1736 imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1]
1738 adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1]
1743 mov ecx, [esp + 36] ; cl = lp_quantization
1744 shrd edx, esi, cl ; edx = (sum >> lp_quantization)
1749 neg edx ; edx = -(sum >> lp_quantization)
1750 mov eax, [esp + 40] ; residual[] - data[]
1751 add edx, [edi] ; edx = data[i] - (sum >> lp_quantization)
1752 mov [edi + eax], edx ; residual[i] = edx (edi + eax addresses residual[i] because eax = residual[] - data[])
1755 dec dword [esp + 24] ; --data_len, counted down in its stack slot
1768 ; **********************************************************************
1770 ; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1775 ; FLAC__ASSERT(order > 0);
1777 ; for(i = 0; i < data_len; i++) {
1779 ; for(j = 0; j < order; j++)
1780 ; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
1781 ; data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
; "Wide" signal restore: each qlp_coeff[j] * data[i-j-1] product is formed as a
; 64-bit value in edx:eax (one-operand imul) before the shift by
; lp_quantization. The C reference loop is in the comment block preceding this
; entry point.
1785 cident FLAC__lpc_restore_signal_wide_asm_ia32
; Stack arguments as addressed by the code below:
1787 ;[esp + 36] lp_quantization
1789 ;[esp + 28] qlp_coeff[]
1790 ;[esp + 24] data_len
1791 ;[esp + 20] residual[]
1794 ;ASSERT(order <= 32)
1795 ;ASSERT(lp_quantization <= 31)
1802 mov ebx, [esp + 24] ; ebx = data_len
1804 jz near .end ; do nothing if data_len == 0
1807 mov eax, [esp + 32] ; eax = order
1811 mov esi, [esp + 20] ; esi = residual[]
1812 mov edi, [esp + 40] ; edi = data[]
1813 mov ecx, [esp + 28] ; ecx = qlp_coeff[]
1814 mov ebp, [ecx] ; ebp = qlp_coeff[0]
1815 mov eax, [edi - 4] ; eax = data[-1]
1816 mov ecx, [esp + 36] ; cl = lp_quantization
1819 imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
1820 shrd eax, edx, cl ; eax = low 32 bits of (edx:eax >> cl). Original note: 0 <= lp_quantization <= 15 (NOTE(review): the ASSERT above permits up to 31 -- confirm which bound is intended)
1835 .x87_32: ; eax = order
1838 lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] ; ebp = 5 * eax + (.jumper_0 - .get_eip0)
1839 call .mov_eip_to_eax ; NOTE(review): by its name this helper returns the current EIP in eax -- confirm
1842 inc ebp ; compensate for the shorter opcode on the last iteration
1844 mov ebx, [esp + 28] ; ebx = qlp_coeff[]
1845 mov edi, [esp + 40] ; edi = data[]
1846 sub [esp + 20], edi ; residual[] -= data[]
; Unrolled multiply steps, highest tap first: [ebx + 4*k] is qlp_coeff[k] and
; [edi - 4*(k+1)] is data[i-(k+1)]; edx (the high half of each product) is
; folded into esi with carry.
1861 mov eax, [ebx + 124] ; eax = qlp_coeff[31]
1862 imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32]
1864 adc esi, edx ; sum += qlp_coeff[31] * data[i-32]
1866 mov eax, [ebx + 120] ; eax = qlp_coeff[30]
1867 imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31]
1869 adc esi, edx ; sum += qlp_coeff[30] * data[i-31]
1871 mov eax, [ebx + 116] ; eax = qlp_coeff[29]
1872 imul dword [edi - 120] ; edx:eax = qlp_coeff[29] * data[i-30]
1876 mov eax, [ebx + 112] ; eax = qlp_coeff[28]
1877 imul dword [edi - 116] ; edx:eax = qlp_coeff[28] * data[i-29]
1881 mov eax, [ebx + 108] ; eax = qlp_coeff[27]
1882 imul dword [edi - 112] ; edx:eax = qlp_coeff[27] * data[i-28]
1886 mov eax, [ebx + 104] ; eax = qlp_coeff[26]
1887 imul dword [edi - 108] ; edx:eax = qlp_coeff[26] * data[i-27]
1891 mov eax, [ebx + 100] ; eax = qlp_coeff[25]
1892 imul dword [edi - 104] ; edx:eax = qlp_coeff[25] * data[i-26]
1897 imul dword [edi - 100] ; edx:eax = eax * data[i-25]
1902 imul dword [edi - 96] ; edx:eax = eax * data[i-24]
1907 imul dword [edi - 92] ; edx:eax = eax * data[i-23]
1912 imul dword [edi - 88] ; edx:eax = eax * data[i-22]
1917 imul dword [edi - 84] ; edx:eax = eax * data[i-21]
1922 imul dword [edi - 80] ; edx:eax = eax * data[i-20]
1927 imul dword [edi - 76] ; edx:eax = eax * data[i-19]
1932 imul dword [edi - 72] ; edx:eax = eax * data[i-18]
1937 imul dword [edi - 68] ; edx:eax = eax * data[i-17]
1942 imul dword [edi - 64] ; edx:eax = eax * data[i-16]
1947 imul dword [edi - 60] ; edx:eax = eax * data[i-15]
1952 imul dword [edi - 56] ; edx:eax = eax * data[i-14]
1957 imul dword [edi - 52] ; edx:eax = eax * data[i-13]
1962 imul dword [edi - 48] ; edx:eax = eax * data[i-12]
1967 imul dword [edi - 44] ; edx:eax = eax * data[i-11]
1972 imul dword [edi - 40] ; edx:eax = eax * data[i-10]
1977 imul dword [edi - 36] ; edx:eax = eax * data[i- 9]
1982 imul dword [edi - 32] ; edx:eax = eax * data[i- 8]
1987 imul dword [edi - 28] ; edx:eax = eax * data[i- 7]
1992 imul dword [edi - 24] ; edx:eax = eax * data[i- 6]
1997 imul dword [edi - 20] ; edx:eax = eax * data[i- 5]
2002 imul dword [edi - 16] ; edx:eax = eax * data[i- 4]
2007 imul dword [edi - 12] ; edx:eax = eax * data[i- 3]
2012 imul dword [edi - 8] ; edx:eax = eax * data[i- 2]
2016 mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
2017 imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1]
2019 adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1]
2024 mov ecx, [esp + 36] ; cl = lp_quantization
2025 shrd edx, esi, cl ; edx = (sum >> lp_quantization)
2031 mov eax, [esp + 20] ; residual[] - data[]
2032 add edx, [edi + eax] ; edx = residual[i] + (sum >> lp_quantization)
2033 mov [edi], edx ; data[i] = residual[i] + (sum >> lp_quantization)
2036 dec dword [esp + 24] ; --data_len, counted down in its stack slot