; Sync to latest upstream
; [pcsx_rearmed.git] / deps / flac-1.3.2 / src / libFLAC / ia32 / lpc_asm.nasm
; CommitLineData
ce188d4d 1; vim:filetype=nasm ts=8
2
3; libFLAC - Free Lossless Audio Codec library
4; Copyright (C) 2001-2009 Josh Coalson
5; Copyright (C) 2011-2016 Xiph.Org Foundation
6;
7; Redistribution and use in source and binary forms, with or without
8; modification, are permitted provided that the following conditions
9; are met:
10;
11; - Redistributions of source code must retain the above copyright
12; notice, this list of conditions and the following disclaimer.
13;
14; - Redistributions in binary form must reproduce the above copyright
15; notice, this list of conditions and the following disclaimer in the
16; documentation and/or other materials provided with the distribution.
17;
18; - Neither the name of the Xiph.org Foundation nor the names of its
19; contributors may be used to endorse or promote products derived from
20; this software without specific prior written permission.
21;
22; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
26; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
27; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
28; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
34%include "nasm.h"
35
36 data_section
37
38cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
39cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
40cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
41cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
42cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
43cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
44cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
45cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
46cglobal FLAC__lpc_restore_signal_asm_ia32
47cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
48cglobal FLAC__lpc_restore_signal_wide_asm_ia32
49
50 code_section
51
52; **********************************************************************
53;
54; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
55; {
56; FLAC__real d;
57; unsigned sample, coeff;
58; const unsigned limit = data_len - lag;
59;
60; FLAC__ASSERT(lag > 0);
61; FLAC__ASSERT(lag <= data_len);
62;
63; for(coeff = 0; coeff < lag; coeff++)
64; autoc[coeff] = 0.0;
65; for(sample = 0; sample <= limit; sample++) {
66; d = data[sample];
67; for(coeff = 0; coeff < lag; coeff++)
68; autoc[coeff] += d * data[sample+coeff];
69; }
70; for(; sample < data_len; sample++) {
71; d = data[sample];
72; for(coeff = 0; coeff < data_len - sample; coeff++)
73; autoc[coeff] += d * data[sample+coeff];
74; }
75; }
76;
77 ALIGN 16
78cident FLAC__lpc_compute_autocorrelation_asm_ia32
79 ;[esp + 28] == autoc[]
80 ;[esp + 24] == lag
81 ;[esp + 20] == data_len
82 ;[esp + 16] == data[]
83
84 ;ASSERT(lag > 0)
85 ;ASSERT(lag <= 33)
86 ;ASSERT(lag <= data_len)
87
88.begin:
89 push esi
90 push edi
91 push ebx
92
93 ; for(coeff = 0; coeff < lag; coeff++)
94 ; autoc[coeff] = 0.0;
95 mov edi, [esp + 28] ; edi == autoc
96 mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write
97 xor eax, eax
98 rep stosd
99
100 ; const unsigned limit = data_len - lag;
101 mov eax, [esp + 24] ; eax == lag
102 mov ecx, [esp + 20]
103 sub ecx, eax ; ecx == limit
104
105 mov edi, [esp + 28] ; edi == autoc
106 mov esi, [esp + 16] ; esi == data
107 inc ecx ; we are looping <= limit so we add one to the counter
108
109 ; for(sample = 0; sample <= limit; sample++) {
110 ; d = data[sample];
111 ; for(coeff = 0; coeff < lag; coeff++)
112 ; autoc[coeff] += d * data[sample+coeff];
113 ; }
114 fld dword [esi] ; ST = d <- data[sample]
115 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
116 lea edx, [eax + eax*2]
117 neg edx
118 lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
119 call .mov_eip_to_ebx
120.get_eip1:
121 add edx, ebx
122 inc edx ; compensate for the shorter opcode on the last iteration
123 inc edx ; compensate for the shorter opcode on the last iteration
124 inc edx ; compensate for the shorter opcode on the last iteration
125 cmp eax, 33
126 jne .loop1_start
127 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration
128.loop1_start:
129 jmp edx
130
131.mov_eip_to_ebx:
132 mov ebx, [esp]
133 ret
134
135 fld st0 ; ST = d d
136 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here!
137 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here!
138 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here!
139 fld st0 ; ST = d d
140 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
141 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
142 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
143 fld st0 ; ST = d d
144 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
145 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
146 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
147 fld st0 ; ST = d d
148 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
149 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
150 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
151 fld st0 ; ST = d d
152 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
153 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
154 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
155 fld st0 ; ST = d d
156 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
157 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
158 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
159 fld st0 ; ST = d d
160 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
161 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
162 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
163 fld st0 ; ST = d d
164 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
165 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
166 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
167 fld st0 ; ST = d d
168 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
169 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
170 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
171 fld st0 ; ST = d d
172 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
173 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
174 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
175 fld st0 ; ST = d d
176 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
177 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
178 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
179 fld st0 ; ST = d d
180 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
181 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
182 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
183 fld st0 ; ST = d d
184 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
185 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
186 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
187 fld st0 ; ST = d d
188 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
189 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
190 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
191 fld st0 ; ST = d d
192 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
193 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
194 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
195 fld st0 ; ST = d d
196 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
197 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
198 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
199 fld st0 ; ST = d d
200 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
201 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
202 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
203 fld st0 ; ST = d d
204 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
205 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
206 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
207 fld st0 ; ST = d d
208 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
209 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
210 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
211 fld st0 ; ST = d d
212 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
213 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
214 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
215 fld st0 ; ST = d d
216 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
217 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
218 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
219 fld st0 ; ST = d d
220 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
221 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
222 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
223 fld st0 ; ST = d d
224 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
225 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
226 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
227 fld st0 ; ST = d d
228 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
229 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
230 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
231 fld st0 ; ST = d d
232 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
233 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
234 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
235 fld st0 ; ST = d d
236 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
237 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
238 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
239 fld st0 ; ST = d d
240 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
241 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
242 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
243 fld st0 ; ST = d d
244 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d
245 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d
246 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d
247 fld st0 ; ST = d d
248 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
249 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
250 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
251 fld st0 ; ST = d d
252 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
253 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
254 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
255 fld st0 ; ST = d d
256 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
257 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
258 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
259 fld st0 ; ST = d d
260 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
261 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
262 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
263 fld st0 ; ST = d d
264 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
265 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
266 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
267.jumper1_0:
268
269 fstp st0 ; pop d, ST = empty
270 add esi, byte 4 ; sample++
271 dec ecx
272 jz .loop1_end
273 fld dword [esi] ; ST = d <- data[sample]
274 jmp edx
275.loop1_end:
276
277 ; for(; sample < data_len; sample++) {
278 ; d = data[sample];
279 ; for(coeff = 0; coeff < data_len - sample; coeff++)
280 ; autoc[coeff] += d * data[sample+coeff];
281 ; }
282 mov ecx, [esp + 24] ; ecx <- lag
283 dec ecx ; ecx <- lag - 1
284 jz near .end ; skip loop if 0 (i.e. lag == 1)
285
286 fld dword [esi] ; ST = d <- data[sample]
287 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through
288 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
289 lea edx, [eax + eax*2]
290 neg edx
291 lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
292 call .mov_eip_to_ebx
293.get_eip2:
294 add edx, ebx
295 inc edx ; compensate for the shorter opcode on the last iteration
296 inc edx ; compensate for the shorter opcode on the last iteration
297 inc edx ; compensate for the shorter opcode on the last iteration
298 jmp edx
299
300 fld st0 ; ST = d d
301 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
302 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
303 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
304 fld st0 ; ST = d d
305 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
306 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
307 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
308 fld st0 ; ST = d d
309 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
310 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
311 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
312 fld st0 ; ST = d d
313 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
314 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
315 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
316 fld st0 ; ST = d d
317 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
318 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
319 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
320 fld st0 ; ST = d d
321 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
322 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
323 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
324 fld st0 ; ST = d d
325 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
326 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
327 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
328 fld st0 ; ST = d d
329 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
330 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
331 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
332 fld st0 ; ST = d d
333 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
334 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
335 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
336 fld st0 ; ST = d d
337 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
338 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
339 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
340 fld st0 ; ST = d d
341 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
342 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
343 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
344 fld st0 ; ST = d d
345 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
346 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
347 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
348 fld st0 ; ST = d d
349 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
350 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
351 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
352 fld st0 ; ST = d d
353 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
354 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
355 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
356 fld st0 ; ST = d d
357 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
358 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
359 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
360 fld st0 ; ST = d d
361 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
362 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
363 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
364 fld st0 ; ST = d d
365 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
366 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
367 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
368 fld st0 ; ST = d d
369 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
370 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
371 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
372 fld st0 ; ST = d d
373 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
374 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
375 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
376 fld st0 ; ST = d d
377 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
378 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
379 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
380 fld st0 ; ST = d d
381 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
382 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
383 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
384 fld st0 ; ST = d d
385 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
386 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
387 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
388 fld st0 ; ST = d d
389 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
390 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
391 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
392 fld st0 ; ST = d d
393 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
394 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
395 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
396 fld st0 ; ST = d d
397 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
398 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
399 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
400 fld st0 ; ST = d d
401 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
402 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
403 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
404 fld st0 ; ST = d d
405 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d
406 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d
407 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d
408 fld st0 ; ST = d d
409 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
410 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
411 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
412 fld st0 ; ST = d d
413 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
414 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
415 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
416 fld st0 ; ST = d d
417 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
418 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
419 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
420 fld st0 ; ST = d d
421 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
422 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
423 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
424 fld st0 ; ST = d d
425 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
426 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
427 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
428.jumper2_0:
429
430 fstp st0 ; pop d, ST = empty
431 add esi, byte 4 ; sample++
432 dec ecx
433 jz .loop2_end
434 add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target
435 fld dword [esi] ; ST = d <- data[sample]
436 jmp edx
437.loop2_end:
438
439.end:
440 pop ebx
441 pop edi
442 pop esi
443 ret
444
445 ALIGN 16
446cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
447 ;[esp + 16] == autoc[]
448 ;[esp + 12] == lag
449 ;[esp + 8] == data_len
450 ;[esp + 4] == data[]
451
452 ;ASSERT(lag > 0)
453 ;ASSERT(lag <= 4)
454 ;ASSERT(lag <= data_len)
455
456 ; for(coeff = 0; coeff < lag; coeff++)
457 ; autoc[coeff] = 0.0;
458 xorps xmm5, xmm5
459
460 mov edx, [esp + 8] ; edx == data_len
461 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
462
463 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
464 add eax, 4
465 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
466 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
467.warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
468 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
469 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
470 dec edx
471 jz .loop_end
472 ALIGN 16
473.loop_start:
474 ; start by reading the next sample
475 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
476 add eax, 4
477 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
478 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
479 movss xmm2, xmm0
480 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
481 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
482 dec edx
483 jnz .loop_start
484.loop_end:
485 ; store autoc
486 mov edx, [esp + 16] ; edx == autoc
487 movups [edx], xmm5
488
489.end:
490 ret
491
492 ALIGN 16
493cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
494 ;[esp + 16] == autoc[]
495 ;[esp + 12] == lag
496 ;[esp + 8] == data_len
497 ;[esp + 4] == data[]
498
499 ;ASSERT(lag > 0)
500 ;ASSERT(lag <= 8)
501 ;ASSERT(lag <= data_len)
502
503 ; for(coeff = 0; coeff < lag; coeff++)
504 ; autoc[coeff] = 0.0;
505 xorps xmm5, xmm5
506 xorps xmm6, xmm6
507
508 mov edx, [esp + 8] ; edx == data_len
509 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
510
511 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
512 add eax, 4
513 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
514 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
515 movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
516 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
517.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
518 mulps xmm0, xmm2
519 mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
520 addps xmm5, xmm0
521 addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
522 dec edx
523 jz .loop_end
524 ALIGN 16
525.loop_start:
526 ; start by reading the next sample
527 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
528 ; here we reorder the instructions; see the (#) indexes for a logical order
529 shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
530 add eax, 4 ; (0)
531 shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
532 shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
533 movss xmm3, xmm2 ; (5)
534 movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
535 movss xmm2, xmm0 ; (6)
536 mulps xmm1, xmm3 ; (8)
537 mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
538 addps xmm6, xmm1 ; (10)
539 addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
540 dec edx
541 jnz .loop_start
542.loop_end:
543 ; store autoc
544 mov edx, [esp + 16] ; edx == autoc
545 movups [edx], xmm5
546 movups [edx + 16], xmm6
547
548.end:
549 ret
550
551 ALIGN 16
552cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
553 ;[esp + 16] == autoc[]
554 ;[esp + 12] == lag
555 ;[esp + 8] == data_len
556 ;[esp + 4] == data[]
557
558 ;ASSERT(lag > 0)
559 ;ASSERT(lag <= 12)
560 ;ASSERT(lag <= data_len)
561
562 ; for(coeff = 0; coeff < lag; coeff++)
563 ; autoc[coeff] = 0.0;
564 xorps xmm5, xmm5
565 xorps xmm6, xmm6
566 xorps xmm7, xmm7
567
568 mov edx, [esp + 8] ; edx == data_len
569 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
570
571 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
572 add eax, 4
573 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
574 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
575 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
576 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0
577.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
578 movaps xmm1, xmm0
579 mulps xmm1, xmm2
580 addps xmm5, xmm1
581 movaps xmm1, xmm0
582 mulps xmm1, xmm3
583 addps xmm6, xmm1
584 mulps xmm0, xmm4
585 addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
586 dec edx
587 jz .loop_end
588 ALIGN 16
589.loop_start:
590 ; start by reading the next sample
591 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
592 add eax, 4
593 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
594
595 ; shift xmm4:xmm3:xmm2 left by one float
596 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
597 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
598 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
599 movss xmm4, xmm3
600 movss xmm3, xmm2
601 movss xmm2, xmm0
602
603 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
604 movaps xmm1, xmm0
605 mulps xmm1, xmm2
606 addps xmm5, xmm1
607 movaps xmm1, xmm0
608 mulps xmm1, xmm3
609 addps xmm6, xmm1
610 mulps xmm0, xmm4
611 addps xmm7, xmm0
612
613 dec edx
614 jnz .loop_start
615.loop_end:
616 ; store autoc
617 mov edx, [esp + 16] ; edx == autoc
618 movups [edx], xmm5
619 movups [edx + 16], xmm6
620 movups [edx + 32], xmm7
621
622.end:
623 ret
624
625 ALIGN 16
626cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
627 ;[ebp + 20] == autoc[]
628 ;[ebp + 16] == lag
629 ;[ebp + 12] == data_len
630 ;[ebp + 8] == data[]
631 ;[esp] == __m128
632 ;[esp + 16] == __m128
633
634 push ebp
635 mov ebp, esp
636 and esp, -16 ; stack realign for SSE instructions 'movaps' and 'addps'
637 sub esp, 32
638
639 ;ASSERT(lag > 0)
640 ;ASSERT(lag <= 12)
641 ;ASSERT(lag <= data_len)
642 ;ASSERT(data_len > 0)
643
644 ; for(coeff = 0; coeff < lag; coeff++)
645 ; autoc[coeff] = 0.0;
646 xorps xmm5, xmm5
647 xorps xmm6, xmm6
648 movaps [esp], xmm5
649 movaps [esp + 16], xmm6
650
651 mov edx, [ebp + 12] ; edx == data_len
652 mov eax, [ebp + 8] ; eax == &data[sample] <- &data[0]
653
654 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
655 add eax, 4
656 movaps xmm1, xmm0 ; xmm1 = 0,0,0,data[0]
657 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
658 xorps xmm2, xmm2 ; xmm2 = 0,0,0,0
659 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
660 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0
661 movaps xmm7, xmm0
662 mulps xmm7, xmm1
663 addps xmm5, xmm7
664 dec edx
665 jz .loop_end
666 ALIGN 16
667.loop_start:
668 ; start by reading the next sample
669 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
670 add eax, 4
671 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
672
673 ; shift xmm4:xmm3:xmm2:xmm1 left by one float
674 shufps xmm1, xmm1, 93h
675 shufps xmm2, xmm2, 93h
676 shufps xmm3, xmm3, 93h
677 shufps xmm4, xmm4, 93h
678 movss xmm4, xmm3
679 movss xmm3, xmm2
680 movss xmm2, xmm1
681 movss xmm1, xmm0
682
683 ; xmmB:xmmA:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1
684 movaps xmm7, xmm0
685 mulps xmm7, xmm1
686 addps xmm5, xmm7
687 movaps xmm7, xmm0
688 mulps xmm7, xmm2
689 addps xmm6, xmm7
690 movaps xmm7, xmm0
691 mulps xmm7, xmm3
692 mulps xmm0, xmm4
693 addps xmm7, [esp]
694 addps xmm0, [esp + 16]
695 movaps [esp], xmm7
696 movaps [esp + 16], xmm0
697
698 dec edx
699 jnz .loop_start
700.loop_end:
701 ; store autoc
702 mov edx, [ebp + 20] ; edx == autoc
703 movups [edx], xmm5
704 movups [edx + 16], xmm6
705 movaps xmm5, [esp]
706 movaps xmm6, [esp + 16]
707 movups [edx + 32], xmm5
708 movups [edx + 48], xmm6
709.end:
710 mov esp, ebp
711 pop ebp
712 ret
713
714;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
715;
716; for(i = 0; i < data_len; i++) {
717; sum = 0;
718; for(j = 0; j < order; j++)
719; sum += qlp_coeff[j] * data[i-j-1];
720; residual[i] = data[i] - (sum >> lp_quantization);
721; }
722;
723 ALIGN 16
724cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
725 ;[esp + 40] residual[]
726 ;[esp + 36] lp_quantization
727 ;[esp + 32] order
728 ;[esp + 28] qlp_coeff[]
729 ;[esp + 24] data_len
730 ;[esp + 20] data[]
731
732 ;ASSERT(order > 0)
733
734 push ebp
735 push ebx
736 push esi
737 push edi
738
739 mov esi, [esp + 20] ; esi = data[]
740 mov edi, [esp + 40] ; edi = residual[]
741 mov eax, [esp + 32] ; eax = order
742 mov ebx, [esp + 24] ; ebx = data_len
743
744 test ebx, ebx
745 jz near .end ; do nothing if data_len == 0
746.begin:
747 cmp eax, byte 1
748 jg short .i_1more
749
750 mov ecx, [esp + 28]
751 mov edx, [ecx] ; edx = qlp_coeff[0]
752 mov eax, [esi - 4] ; eax = data[-1]
753 mov ecx, [esp + 36] ; cl = lp_quantization
754 ALIGN 16
755.i_1_loop_i:
756 imul eax, edx
757 sar eax, cl
758 neg eax
759 add eax, [esi]
760 mov [edi], eax
761 mov eax, [esi]
762 add edi, byte 4
763 add esi, byte 4
764 dec ebx
765 jnz .i_1_loop_i
766
767 jmp .end
768
769.i_1more:
770 cmp eax, byte 32 ; for order <= 32 there is a faster routine
771 jbe short .i_32
772
773 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
774 ALIGN 16
775.i_32more_loop_i:
776 xor ebp, ebp
777 mov ecx, [esp + 32]
778 mov edx, ecx
779 shl edx, 2
780 add edx, [esp + 28]
781 neg ecx
782 ALIGN 16
783.i_32more_loop_j:
784 sub edx, byte 4
785 mov eax, [edx]
786 imul eax, [esi + 4 * ecx]
787 add ebp, eax
788 inc ecx
789 jnz short .i_32more_loop_j
790
791 mov ecx, [esp + 36]
792 sar ebp, cl
793 neg ebp
794 add ebp, [esi]
795 mov [edi], ebp
796 add esi, byte 4
797 add edi, byte 4
798
799 dec ebx
800 jnz .i_32more_loop_i
801
802 jmp .end
803
804.mov_eip_to_eax:
805 mov eax, [esp]
806 ret
807
808.i_32:
809 sub edi, esi
810 neg eax
811 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
812 call .mov_eip_to_eax
813.get_eip0:
814 add edx, eax
815 inc edx
816 mov eax, [esp + 28] ; eax = qlp_coeff[]
817 xor ebp, ebp
818 jmp edx
819
820 mov ecx, [eax + 124]
821 imul ecx, [esi - 128]
822 add ebp, ecx
823 mov ecx, [eax + 120]
824 imul ecx, [esi - 124]
825 add ebp, ecx
826 mov ecx, [eax + 116]
827 imul ecx, [esi - 120]
828 add ebp, ecx
829 mov ecx, [eax + 112]
830 imul ecx, [esi - 116]
831 add ebp, ecx
832 mov ecx, [eax + 108]
833 imul ecx, [esi - 112]
834 add ebp, ecx
835 mov ecx, [eax + 104]
836 imul ecx, [esi - 108]
837 add ebp, ecx
838 mov ecx, [eax + 100]
839 imul ecx, [esi - 104]
840 add ebp, ecx
841 mov ecx, [eax + 96]
842 imul ecx, [esi - 100]
843 add ebp, ecx
844 mov ecx, [eax + 92]
845 imul ecx, [esi - 96]
846 add ebp, ecx
847 mov ecx, [eax + 88]
848 imul ecx, [esi - 92]
849 add ebp, ecx
850 mov ecx, [eax + 84]
851 imul ecx, [esi - 88]
852 add ebp, ecx
853 mov ecx, [eax + 80]
854 imul ecx, [esi - 84]
855 add ebp, ecx
856 mov ecx, [eax + 76]
857 imul ecx, [esi - 80]
858 add ebp, ecx
859 mov ecx, [eax + 72]
860 imul ecx, [esi - 76]
861 add ebp, ecx
862 mov ecx, [eax + 68]
863 imul ecx, [esi - 72]
864 add ebp, ecx
865 mov ecx, [eax + 64]
866 imul ecx, [esi - 68]
867 add ebp, ecx
868 mov ecx, [eax + 60]
869 imul ecx, [esi - 64]
870 add ebp, ecx
871 mov ecx, [eax + 56]
872 imul ecx, [esi - 60]
873 add ebp, ecx
874 mov ecx, [eax + 52]
875 imul ecx, [esi - 56]
876 add ebp, ecx
877 mov ecx, [eax + 48]
878 imul ecx, [esi - 52]
879 add ebp, ecx
880 mov ecx, [eax + 44]
881 imul ecx, [esi - 48]
882 add ebp, ecx
883 mov ecx, [eax + 40]
884 imul ecx, [esi - 44]
885 add ebp, ecx
886 mov ecx, [eax + 36]
887 imul ecx, [esi - 40]
888 add ebp, ecx
889 mov ecx, [eax + 32]
890 imul ecx, [esi - 36]
891 add ebp, ecx
892 mov ecx, [eax + 28]
893 imul ecx, [esi - 32]
894 add ebp, ecx
895 mov ecx, [eax + 24]
896 imul ecx, [esi - 28]
897 add ebp, ecx
898 mov ecx, [eax + 20]
899 imul ecx, [esi - 24]
900 add ebp, ecx
901 mov ecx, [eax + 16]
902 imul ecx, [esi - 20]
903 add ebp, ecx
904 mov ecx, [eax + 12]
905 imul ecx, [esi - 16]
906 add ebp, ecx
907 mov ecx, [eax + 8]
908 imul ecx, [esi - 12]
909 add ebp, ecx
910 mov ecx, [eax + 4]
911 imul ecx, [esi - 8]
912 add ebp, ecx
913 mov ecx, [eax] ; there is one byte missing
914 imul ecx, [esi - 4]
915 add ebp, ecx
916.jumper_0:
917
918 mov ecx, [esp + 36]
919 sar ebp, cl
920 neg ebp
921 add ebp, [esi]
922 mov [edi + esi], ebp
923 add esi, byte 4
924
925 dec ebx
926 jz short .end
927 xor ebp, ebp
928 jmp edx
929
930.end:
931 pop edi
932 pop esi
933 pop ebx
934 pop ebp
935 ret
936
; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel and qlp_coeffs must be <= 16. Especially note that this routine
; cannot be used for side-channel coded 16bps channels since the effective bps
; is 17.
;
; MMX variant of FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32:
; the coefficients are repacked as 16-bit words on the stack so pmaddwd can do
; four 16x16->32 multiply-accumulates per instruction; two residual samples
; are produced per outer-loop iteration.  If data_len is odd, the final sample
; is handed off to the plain ia32 routine (see .last_one).
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = data[]
	mov	edi, [esp + 40]			; edi = residual[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0
	dec	ebx
	test	ebx, ebx
	jz	near .last_one			; data_len == 1: let the ia32 routine handle it

	mov	edx, [esp + 28]			; edx = qlp_coeff[]
	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization
	mov	ebp, esp			; save esp; restored at .mmx_end

	and	esp, 0xfffffff8			; 8-byte-align stack for the movq/pmaddwd reads below

	; push the 32-bit coefficients as 16-bit words (see WATCHOUT: they fit),
	; qlp_coeff[0] ends up at the highest address of the packed array
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	; zero-pad the packed coefficients up to a multiple of 4 taps;
	; eax becomes the padded order
	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = qlp_coeff[3..0] as words (q[3] in low word)
	movd	mm4, [esi - 16]
	punpckldq	mm4, [esi - 12]
	movd	mm0, [esi - 8]
	punpckldq	mm0, [esi - 4]
	packssdw	mm4, mm0		; mm4 = data[i-4..i-1] as words (history window)

	cmp	eax, byte 4
	jnbe	short .mmx_4more

	; padded order == 4: the whole filter lives in mm5 (coeffs) / mm4 (window)
	ALIGN 16
.mmx_4_loop_i:
	movd	mm1, [esi]
	movq	mm3, mm4
	punpckldq	mm1, [esi + 4]		; mm1 = data[i+1]:data[i]
	psrlq	mm4, 16
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0			; shift data[i] into the history window
	movq	mm2, mm4
	psrlq	mm4, 16
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5			; partial sums for sample i
	pmaddwd	mm2, mm5			; partial sums for sample i+1
	psllq	mm0, 16
	por	mm4, mm0			; shift data[i+1] into the history window
	movq	mm0, mm3
	punpckldq	mm3, mm2
	punpckhdq	mm0, mm2
	paddd	mm3, mm0			; mm3 = sum(i+1):sum(i)
	psrad	mm3, mm6			; sum >> lp_quantization (arithmetic)
	psubd	mm1, mm3			; data[] - (sum >> lp_quantization)
	movd	[edi], mm1			; residual[i]
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1			; residual[i+1]

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2				; two samples per iteration
	jg	.mmx_4_loop_i
	jmp	.mmx_end

.mmx_4more:
	; padded order > 4: eax = 16 - 4*order, i.e. the byte offset from
	; &data[i] back to &data[i-order+4]; the inner loop walks from there
	; up to &data[i], four taps per pass
	shl	eax, 2
	neg	eax
	add	eax, byte 16

	ALIGN 16
.mmx_4more_loop_i:
	; window the newest four samples exactly as in .mmx_4_loop_i ...
	movd	mm1, [esi]
	punpckldq	mm1, [esi + 4]
	movq	mm3, mm4
	psrlq	mm4, 16
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0
	movq	mm2, mm4
	psrlq	mm4, 16
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5
	pmaddwd	mm2, mm5
	psllq	mm0, 16
	por	mm4, mm0

	mov	ecx, esi
	add	ecx, eax			; ecx = &data[i-order+4]
	mov	edx, esp			; edx = packed coeffs (highest-order taps first)

	; ... then accumulate the remaining (order-4) taps for both samples
	ALIGN 16
.mmx_4more_loop_j:
	movd	mm0, [ecx - 16]
	movd	mm7, [ecx - 8]
	punpckldq	mm0, [ecx - 12]
	punpckldq	mm7, [ecx - 4]
	packssdw	mm0, mm7
	pmaddwd	mm0, [edx]
	punpckhdq	mm7, mm7
	paddd	mm3, mm0			; accumulate into sum(i)
	movd	mm0, [ecx - 12]
	punpckldq	mm0, [ecx - 8]
	punpckldq	mm7, [ecx]
	packssdw	mm0, mm7
	pmaddwd	mm0, [edx]
	paddd	mm2, mm0			; accumulate into sum(i+1): same taps, window shifted by one

	add	edx, byte 8
	add	ecx, byte 16
	cmp	ecx, esi
	jnz	.mmx_4more_loop_j

	movq	mm0, mm3
	punpckldq	mm3, mm2
	punpckhdq	mm0, mm2
	paddd	mm3, mm0			; mm3 = sum(i+1):sum(i)
	psrad	mm3, mm6			; sum >> lp_quantization
	psubd	mm1, mm3			; data[] - (sum >> lp_quantization)
	movd	[edi], mm1
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2				; two samples per iteration
	jg	near .mmx_4more_loop_i

.mmx_end:
	emms
	mov	esp, ebp			; drop the packed-coefficient pushes
.last_one:
	; odd data_len: one sample remains; esi/edi already point at it, so
	; tail-call into the plain ia32 routine's main body
	mov	eax, [esp + 32]
	inc	ebx
	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1115
1116; **********************************************************************
1117;
1118; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1119; {
1120; unsigned i, j;
1121; FLAC__int32 sum;
1122;
1123; FLAC__ASSERT(order > 0);
1124;
1125; for(i = 0; i < data_len; i++) {
1126; sum = 0;
1127; for(j = 0; j < order; j++)
1128; sum += qlp_coeff[j] * data[i-j-1];
1129; data[i] = residual[i] + (sum >> lp_quantization);
1130; }
1131; }
ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = residual[]
	mov	edi, [esp + 40]			; edi = data[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0

.begin:
	cmp	eax, byte 1
	jg	short .x87_1more

	; order == 1: single-tap filter, everything stays in registers
	mov	ecx, [esp + 28]
	mov	edx, [ecx]			; edx = qlp_coeff[0]
	mov	eax, [edi - 4]			; eax = data[-1]
	mov	ecx, [esp + 36]			; cl = lp_quantization
	ALIGN 16
.x87_1_loop_i:
	imul	eax, edx			; eax = qlp_coeff[0] * data[i-1]
	sar	eax, cl				; eax = sum >> lp_quantization
	add	eax, [esi]			; eax += residual[i]
	mov	[edi], eax			; data[i] = result; eax is data[i-1] for next pass
	add	esi, byte 4
	add	edi, byte 4
	dec	ebx
	jnz	.x87_1_loop_i

	jmp	.end

.x87_1more:
	cmp	eax, byte 32			; for order <= 32 there is a faster routine
	jbe	short .x87_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.x87_32more_loop_i:
	xor	ebp, ebp			; ebp = sum = 0
	mov	ecx, [esp + 32]			; ecx = order
	mov	edx, ecx
	shl	edx, 2
	add	edx, [esp + 28]			; edx = &qlp_coeff[order]
	neg	ecx				; ecx runs -order..-1, indexing data[i-order]..data[i-1]
	ALIGN 16
.x87_32more_loop_j:
	sub	edx, byte 4
	mov	eax, [edx]			; eax = qlp_coeff[j]
	imul	eax, [edi + 4 * ecx]		; eax = qlp_coeff[j] * data[i-j-1]
	add	ebp, eax			; sum += qlp_coeff[j] * data[i-j-1]
	inc	ecx
	jnz	short .x87_32more_loop_j

	mov	ecx, [esp + 36]
	sar	ebp, cl				; ebp = sum >> lp_quantization
	add	ebp, [esi]			; ebp = residual[i] + (sum >> lp_quantization)
	mov	[edi], ebp			; data[i] = result
	add	edi, byte 4
	add	esi, byte 4

	dec	ebx
	jnz	.x87_32more_loop_i

	jmp	.end

.mov_eip_to_eax:
	; helper: returns the caller's return address (eip at .get_eip0) in eax,
	; used to build position-independent jump targets below
	mov	eax, [esp]
	ret

.x87_32:
	; 1 < order <= 32: jump into a fully unrolled 32-tap multiply-accumulate
	; chain so exactly 'order' taps execute per sample.  Each tap encodes to
	; 9 bytes, so the entry point is .jumper_0 - 9*order (+1 because the
	; final tap's [eax] addressing mode is one byte shorter; see NOTE below).
	sub	esi, edi			; esi = residual[] - data[], so [esi + edi] = residual[i]
	neg	eax
	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]	; edx = -9*order + (relative .jumper_0)
	call	.mov_eip_to_eax
.get_eip0:
	add	edx, eax			; edx = absolute entry address into the chain
	inc	edx				; compensate for the shorter opcode on the last iteration
	mov	eax, [esp + 28]			; eax = qlp_coeff[]
	xor	ebp, ebp			; ebp = sum = 0
	jmp	edx

	mov	ecx, [eax + 124]		; ecx = qlp_coeff[31]
	imul	ecx, [edi - 128]		; ecx = qlp_coeff[31] * data[i-32]
	add	ebp, ecx			; sum += qlp_coeff[31] * data[i-32]
	mov	ecx, [eax + 120]		; ecx = qlp_coeff[30]
	imul	ecx, [edi - 124]		; ecx = qlp_coeff[30] * data[i-31]
	add	ebp, ecx			; sum += qlp_coeff[30] * data[i-31]
	mov	ecx, [eax + 116]		; ecx = qlp_coeff[29]
	imul	ecx, [edi - 120]		; ecx = qlp_coeff[29] * data[i-30]
	add	ebp, ecx			; sum += qlp_coeff[29] * data[i-30]
	mov	ecx, [eax + 112]		; ecx = qlp_coeff[28]
	imul	ecx, [edi - 116]		; ecx = qlp_coeff[28] * data[i-29]
	add	ebp, ecx			; sum += qlp_coeff[28] * data[i-29]
	mov	ecx, [eax + 108]		; ecx = qlp_coeff[27]
	imul	ecx, [edi - 112]		; ecx = qlp_coeff[27] * data[i-28]
	add	ebp, ecx			; sum += qlp_coeff[27] * data[i-28]
	mov	ecx, [eax + 104]		; ecx = qlp_coeff[26]
	imul	ecx, [edi - 108]		; ecx = qlp_coeff[26] * data[i-27]
	add	ebp, ecx			; sum += qlp_coeff[26] * data[i-27]
	mov	ecx, [eax + 100]		; ecx = qlp_coeff[25]
	imul	ecx, [edi - 104]		; ecx = qlp_coeff[25] * data[i-26]
	add	ebp, ecx			; sum += qlp_coeff[25] * data[i-26]
	mov	ecx, [eax + 96]			; ecx = qlp_coeff[24]
	imul	ecx, [edi - 100]		; ecx = qlp_coeff[24] * data[i-25]
	add	ebp, ecx			; sum += qlp_coeff[24] * data[i-25]
	mov	ecx, [eax + 92]			; ecx = qlp_coeff[23]
	imul	ecx, [edi - 96]			; ecx = qlp_coeff[23] * data[i-24]
	add	ebp, ecx			; sum += qlp_coeff[23] * data[i-24]
	mov	ecx, [eax + 88]			; ecx = qlp_coeff[22]
	imul	ecx, [edi - 92]			; ecx = qlp_coeff[22] * data[i-23]
	add	ebp, ecx			; sum += qlp_coeff[22] * data[i-23]
	mov	ecx, [eax + 84]			; ecx = qlp_coeff[21]
	imul	ecx, [edi - 88]			; ecx = qlp_coeff[21] * data[i-22]
	add	ebp, ecx			; sum += qlp_coeff[21] * data[i-22]
	mov	ecx, [eax + 80]			; ecx = qlp_coeff[20]
	imul	ecx, [edi - 84]			; ecx = qlp_coeff[20] * data[i-21]
	add	ebp, ecx			; sum += qlp_coeff[20] * data[i-21]
	mov	ecx, [eax + 76]			; ecx = qlp_coeff[19]
	imul	ecx, [edi - 80]			; ecx = qlp_coeff[19] * data[i-20]
	add	ebp, ecx			; sum += qlp_coeff[19] * data[i-20]
	mov	ecx, [eax + 72]			; ecx = qlp_coeff[18]
	imul	ecx, [edi - 76]			; ecx = qlp_coeff[18] * data[i-19]
	add	ebp, ecx			; sum += qlp_coeff[18] * data[i-19]
	mov	ecx, [eax + 68]			; ecx = qlp_coeff[17]
	imul	ecx, [edi - 72]			; ecx = qlp_coeff[17] * data[i-18]
	add	ebp, ecx			; sum += qlp_coeff[17] * data[i-18]
	mov	ecx, [eax + 64]			; ecx = qlp_coeff[16]
	imul	ecx, [edi - 68]			; ecx = qlp_coeff[16] * data[i-17]
	add	ebp, ecx			; sum += qlp_coeff[16] * data[i-17]
	mov	ecx, [eax + 60]			; ecx = qlp_coeff[15]
	imul	ecx, [edi - 64]			; ecx = qlp_coeff[15] * data[i-16]
	add	ebp, ecx			; sum += qlp_coeff[15] * data[i-16]
	mov	ecx, [eax + 56]			; ecx = qlp_coeff[14]
	imul	ecx, [edi - 60]			; ecx = qlp_coeff[14] * data[i-15]
	add	ebp, ecx			; sum += qlp_coeff[14] * data[i-15]
	mov	ecx, [eax + 52]			; ecx = qlp_coeff[13]
	imul	ecx, [edi - 56]			; ecx = qlp_coeff[13] * data[i-14]
	add	ebp, ecx			; sum += qlp_coeff[13] * data[i-14]
	mov	ecx, [eax + 48]			; ecx = qlp_coeff[12]
	imul	ecx, [edi - 52]			; ecx = qlp_coeff[12] * data[i-13]
	add	ebp, ecx			; sum += qlp_coeff[12] * data[i-13]
	mov	ecx, [eax + 44]			; ecx = qlp_coeff[11]
	imul	ecx, [edi - 48]			; ecx = qlp_coeff[11] * data[i-12]
	add	ebp, ecx			; sum += qlp_coeff[11] * data[i-12]
	mov	ecx, [eax + 40]			; ecx = qlp_coeff[10]
	imul	ecx, [edi - 44]			; ecx = qlp_coeff[10] * data[i-11]
	add	ebp, ecx			; sum += qlp_coeff[10] * data[i-11]
	mov	ecx, [eax + 36]			; ecx = qlp_coeff[ 9]
	imul	ecx, [edi - 40]			; ecx = qlp_coeff[ 9] * data[i-10]
	add	ebp, ecx			; sum += qlp_coeff[ 9] * data[i-10]
	mov	ecx, [eax + 32]			; ecx = qlp_coeff[ 8]
	imul	ecx, [edi - 36]			; ecx = qlp_coeff[ 8] * data[i- 9]
	add	ebp, ecx			; sum += qlp_coeff[ 8] * data[i- 9]
	mov	ecx, [eax + 28]			; ecx = qlp_coeff[ 7]
	imul	ecx, [edi - 32]			; ecx = qlp_coeff[ 7] * data[i- 8]
	add	ebp, ecx			; sum += qlp_coeff[ 7] * data[i- 8]
	mov	ecx, [eax + 24]			; ecx = qlp_coeff[ 6]
	imul	ecx, [edi - 28]			; ecx = qlp_coeff[ 6] * data[i- 7]
	add	ebp, ecx			; sum += qlp_coeff[ 6] * data[i- 7]
	mov	ecx, [eax + 20]			; ecx = qlp_coeff[ 5]
	imul	ecx, [edi - 24]			; ecx = qlp_coeff[ 5] * data[i- 6]
	add	ebp, ecx			; sum += qlp_coeff[ 5] * data[i- 6]
	mov	ecx, [eax + 16]			; ecx = qlp_coeff[ 4]
	imul	ecx, [edi - 20]			; ecx = qlp_coeff[ 4] * data[i- 5]
	add	ebp, ecx			; sum += qlp_coeff[ 4] * data[i- 5]
	mov	ecx, [eax + 12]			; ecx = qlp_coeff[ 3]
	imul	ecx, [edi - 16]			; ecx = qlp_coeff[ 3] * data[i- 4]
	add	ebp, ecx			; sum += qlp_coeff[ 3] * data[i- 4]
	mov	ecx, [eax + 8]			; ecx = qlp_coeff[ 2]
	imul	ecx, [edi - 12]			; ecx = qlp_coeff[ 2] * data[i- 3]
	add	ebp, ecx			; sum += qlp_coeff[ 2] * data[i- 3]
	mov	ecx, [eax + 4]			; ecx = qlp_coeff[ 1]
	imul	ecx, [edi - 8]			; ecx = qlp_coeff[ 1] * data[i- 2]
	add	ebp, ecx			; sum += qlp_coeff[ 1] * data[i- 2]
	mov	ecx, [eax]			; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul	ecx, [edi - 4]			; ecx = qlp_coeff[ 0] * data[i- 1]
	add	ebp, ecx			; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:

	mov	ecx, [esp + 36]
	sar	ebp, cl				; ebp = (sum >> lp_quantization)
	add	ebp, [esi + edi]		; ebp = residual[i] + (sum >> lp_quantization)
	mov	[edi], ebp			; data[i] = residual[i] + (sum >> lp_quantization)
	add	edi, byte 4

	dec	ebx
	jz	short .end
	xor	ebp, ebp			; sum = 0; re-enter the unrolled chain for the next sample
	jmp	edx

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1342
; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel and qlp_coeffs must be <= 16. Especially note that this routine
; cannot be used for side-channel coded 16bps channels since the effective bps
; is 17.
; WATCHOUT: this routine requires that each data array have a buffer of up to
; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
;
; MMX variant of FLAC__lpc_restore_signal_asm_ia32: coefficients are repacked
; as 16-bit words on the stack and pmaddwd accumulates four taps at a time;
; one reconstructed sample is produced per outer-loop iteration (each output
; feeds the history window for the next, so samples cannot be paired up).
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = residual[]
	mov	edi, [esp + 40]			; edi = data[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0
	cmp	eax, byte 4
	jb	near FLAC__lpc_restore_signal_asm_ia32.begin	; order < 4: use the plain ia32 routine

	mov	edx, [esp + 28]			; edx = qlp_coeff[]
	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization
	mov	ebp, esp			; save esp; restored at .mmx_end

	and	esp, 0xfffffff8			; 8-byte-align stack for the movq/pmaddwd reads below

	; push the 32-bit coefficients as 16-bit words (see WATCHOUT: they fit),
	; qlp_coeff[0] ends up at the highest address of the packed array
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	; zero-pad the packed coefficients up to a multiple of 4 taps;
	; eax becomes the padded order
	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = qlp_coeff[3..0] as words (q[3] in low word)
	movd	mm4, [edi - 16]
	punpckldq	mm4, [edi - 12]
	movd	mm0, [edi - 8]
	punpckldq	mm0, [edi - 4]
	packssdw	mm4, mm0		; mm4 = data[i-4..i-1] as words (history window)

	cmp	eax, byte 4
	jnbe	short .mmx_4more

	; padded order == 4: the whole filter lives in mm5 (coeffs) / mm4 (window)
	ALIGN 16
.mmx_4_loop_i:
	movq	mm7, mm4
	pmaddwd	mm7, mm5			; two partial dword sums
	movq	mm0, mm7
	punpckhdq	mm7, mm7
	paddd	mm7, mm0			; low dword = full sum
	psrad	mm7, mm6			; sum >> lp_quantization (arithmetic)
	movd	mm1, [esi]
	paddd	mm7, mm1			; += residual[i]
	movd	[edi], mm7			; data[i] = result
	psllq	mm7, 48
	psrlq	mm4, 16
	por	mm4, mm7			; shift the new sample into the history window

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.mmx_4_loop_i
	jmp	.mmx_end
.mmx_4more:
	; padded order > 4: eax = 16 - 4*order, i.e. the byte offset from
	; &data[i] back to &data[i-order+4]
	shl	eax, 2
	neg	eax
	add	eax, byte 16
	ALIGN 16
.mmx_4more_loop_i:
	mov	ecx, edi
	add	ecx, eax			; ecx = &data[i-order+4]
	mov	edx, esp			; edx = packed coeffs (highest-order taps first)

	movq	mm7, mm4
	pmaddwd	mm7, mm5			; newest four taps from the register window

	; accumulate the remaining (order-4) taps, four per pass
	ALIGN 16
.mmx_4more_loop_j:
	movd	mm0, [ecx - 16]
	punpckldq	mm0, [ecx - 12]
	movd	mm1, [ecx - 8]
	punpckldq	mm1, [ecx - 4]
	packssdw	mm0, mm1
	pmaddwd	mm0, [edx]
	paddd	mm7, mm0

	add	edx, byte 8
	add	ecx, byte 16
	cmp	ecx, edi
	jnz	.mmx_4more_loop_j

	movq	mm0, mm7
	punpckhdq	mm7, mm7
	paddd	mm7, mm0			; low dword = full sum
	psrad	mm7, mm6			; sum >> lp_quantization
	movd	mm1, [esi]
	paddd	mm7, mm1			; += residual[i]
	movd	[edi], mm7			; data[i] = result
	psllq	mm7, 48
	psrlq	mm4, 16
	por	mm4, mm7			; shift the new sample into the history window

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	short .mmx_4more_loop_i
.mmx_end:
	emms
	mov	esp, ebp			; drop the packed-coefficient pushes

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1485
1486
1487; **********************************************************************
1488;
1489;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
1490; {
1491; unsigned i, j;
1492; FLAC__int64 sum;
1493;
1494; FLAC__ASSERT(order > 0);
1495;
1496; for(i = 0; i < data_len; i++) {
1497; sum = 0;
1498; for(j = 0; j < order; j++)
1499; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
1500; residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
1501; }
1502; }
ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)
	;ASSERT(order <= 32)
	;ASSERT(lp_quantization <= 31)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	ebx, [esp + 24]			; ebx = data_len
	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0

.begin:
	mov	eax, [esp + 32]			; eax = order
	cmp	eax, 1
	jg	short .i_32

	; order == 1: single tap; the 64-bit product stays in edx:eax
	mov	esi, [esp + 40]			; esi = residual[]
	mov	edi, [esp + 20]			; edi = data[]
	mov	ecx, [esp + 28]			; ecx = qlp_coeff[]
	mov	ebp, [ecx]			; ebp = qlp_coeff[0]
	mov	eax, [edi - 4]			; eax = data[-1]
	mov	ecx, [esp + 36]			; cl = lp_quantization
	ALIGN 16
.i_1_loop_i:
	imul	ebp				; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
	shrd	eax, edx, cl			; 0 <= lp_quantization <= 15
	neg	eax				; eax = -(sum >> lp_quantization)
	add	eax, [edi]			; eax = data[i] - (sum >> lp_quantization)
	mov	[esi], eax			; residual[i] = eax
	mov	eax, [edi]			; reload data[i] for the next iteration
	add	esi, 4
	add	edi, 4
	dec	ebx
	jnz	.i_1_loop_i
	jmp	.end

.mov_eip_to_eax:
	; helper: returns the caller's return address (eip at .get_eip0) in eax,
	; used to build position-independent jump targets below
	mov	eax, [esp]
	ret

.i_32:	; eax = order
	; 1 < order <= 32: jump into a fully unrolled 32-tap 64-bit
	; multiply-accumulate chain so exactly 'order' taps execute per sample.
	; Each tap encodes to 10 bytes, so the entry point is
	; .jumper_0 - 10*order (+1 because the final tap's [ebx] addressing
	; mode is one byte shorter; see NOTE below).
	neg	eax
	add	eax, eax			; eax = -2*order
	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]	; ebp = -10*order + (relative .jumper_0)
	call	.mov_eip_to_eax
.get_eip0:
	add	ebp, eax			; ebp = absolute entry address into the chain
	inc	ebp				; compensate for the shorter opcode on the last iteration

	mov	ebx, [esp + 28]			; ebx = qlp_coeff[]
	mov	edi, [esp + 20]			; edi = data[]
	sub	[esp + 40], edi			; residual[] -= data[], so [edi + saved offset] = residual[i]

	xor	ecx, ecx			; ecx = sum low dword
	xor	esi, esi			; esi = sum high dword
	jmp	ebp

;eax = --
;edx = --
;ecx = 0
;esi = 0
;
;ebx = qlp_coeff[]
;edi = data[]
;ebp = @address

	mov	eax, [ebx + 124]		; eax = qlp_coeff[31]
	imul	dword [edi - 128]		; edx:eax = qlp_coeff[31] * data[i-32]
	add	ecx, eax
	adc	esi, edx			; sum += qlp_coeff[31] * data[i-32]

	mov	eax, [ebx + 120]		; eax = qlp_coeff[30]
	imul	dword [edi - 124]		; edx:eax = qlp_coeff[30] * data[i-31]
	add	ecx, eax
	adc	esi, edx			; sum += qlp_coeff[30] * data[i-31]

	; the same 10-byte tap pattern repeats for qlp_coeff[29] .. qlp_coeff[1]

	mov	eax, [ebx + 116]
	imul	dword [edi - 120]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 112]
	imul	dword [edi - 116]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 108]
	imul	dword [edi - 112]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 104]
	imul	dword [edi - 108]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 100]
	imul	dword [edi - 104]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 96]
	imul	dword [edi - 100]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 92]
	imul	dword [edi - 96]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 88]
	imul	dword [edi - 92]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 84]
	imul	dword [edi - 88]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 80]
	imul	dword [edi - 84]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 76]
	imul	dword [edi - 80]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 72]
	imul	dword [edi - 76]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 68]
	imul	dword [edi - 72]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 64]
	imul	dword [edi - 68]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 60]
	imul	dword [edi - 64]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 56]
	imul	dword [edi - 60]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 52]
	imul	dword [edi - 56]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 48]
	imul	dword [edi - 52]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 44]
	imul	dword [edi - 48]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 40]
	imul	dword [edi - 44]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 36]
	imul	dword [edi - 40]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 32]
	imul	dword [edi - 36]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 28]
	imul	dword [edi - 32]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 24]
	imul	dword [edi - 28]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 20]
	imul	dword [edi - 24]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 16]
	imul	dword [edi - 20]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 12]
	imul	dword [edi - 16]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 8]
	imul	dword [edi - 12]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 4]
	imul	dword [edi - 8]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx]			; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul	dword [edi - 4]			; edx:eax = qlp_coeff[ 0] * data[i- 1]
	add	ecx, eax
	adc	esi, edx			; sum += qlp_coeff[ 0] * data[i- 1]

.jumper_0:
	mov	edx, ecx
;esi:edx = sum
	mov	ecx, [esp + 36]			; cl = lp_quantization
	shrd	edx, esi, cl			; edx = (sum >> lp_quantization)
;eax = --
;ecx = --
;edx = sum >> lp_q
;esi = --
	neg	edx				; edx = -(sum >> lp_quantization)
	mov	eax, [esp + 40]			; residual[] - data[]
	add	edx, [edi]			; edx = data[i] - (sum >> lp_quantization)
	mov	[edi + eax], edx		; residual[i] = edx
	add	edi, 4

	dec	dword [esp + 24]		; data_len--
	jz	short .end
	xor	ecx, ecx			; sum = 0; re-enter the unrolled chain
	xor	esi, esi
	jmp	ebp

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1767
1768; **********************************************************************
1769;
1770; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1771; {
1772; unsigned i, j;
1773; FLAC__int64 sum;
1774;
1775; FLAC__ASSERT(order > 0);
1776;
1777; for(i = 0; i < data_len; i++) {
1778; sum = 0;
1779; for(j = 0; j < order; j++)
1780; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
1781; data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
1782; }
1783; }
ALIGN 16
cident FLAC__lpc_restore_signal_wide_asm_ia32
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)
	;ASSERT(order <= 32)
	;ASSERT(lp_quantization <= 31)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	ebx, [esp + 24]			; ebx = data_len
	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0

.begin:
	mov	eax, [esp + 32]			; eax = order
	cmp	eax, 1
	jg	short .x87_32

	; order == 1: single tap; the 64-bit product stays in edx:eax
	mov	esi, [esp + 20]			; esi = residual[]
	mov	edi, [esp + 40]			; edi = data[]
	mov	ecx, [esp + 28]			; ecx = qlp_coeff[]
	mov	ebp, [ecx]			; ebp = qlp_coeff[0]
	mov	eax, [edi - 4]			; eax = data[-1]
	mov	ecx, [esp + 36]			; cl = lp_quantization
	ALIGN 16
.x87_1_loop_i:
	imul	ebp				; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
	shrd	eax, edx, cl			; 0 <= lp_quantization <= 15
;
	add	eax, [esi]			; eax = residual[i] + (sum >> lp_quantization)
	mov	[edi], eax			; data[i] = eax; eax is data[i-1] for next pass
;
	add	esi, 4
	add	edi, 4
	dec	ebx
	jnz	.x87_1_loop_i
	jmp	.end

.mov_eip_to_eax:
	; helper: returns the caller's return address (eip at .get_eip0) in eax,
	; used to build position-independent jump targets below
	mov	eax, [esp]
	ret

.x87_32:	; eax = order
	; 1 < order <= 32: jump into a fully unrolled 32-tap 64-bit
	; multiply-accumulate chain so exactly 'order' taps execute per sample.
	; Each tap encodes to 10 bytes, so the entry point is
	; .jumper_0 - 10*order (+1 because the final tap's [ebx] addressing
	; mode is one byte shorter; see NOTE below).
	neg	eax
	add	eax, eax			; eax = -2*order
	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]	; ebp = -10*order + (relative .jumper_0)
	call	.mov_eip_to_eax
.get_eip0:
	add	ebp, eax			; ebp = absolute entry address into the chain
	inc	ebp				; compensate for the shorter opcode on the last iteration

	mov	ebx, [esp + 28]			; ebx = qlp_coeff[]
	mov	edi, [esp + 40]			; edi = data[]
	sub	[esp + 20], edi			; residual[] -= data[], so [edi + saved offset] = residual[i]

	xor	ecx, ecx			; ecx = sum low dword
	xor	esi, esi			; esi = sum high dword
	jmp	ebp

;eax = --
;edx = --
;ecx = 0
;esi = 0
;
;ebx = qlp_coeff[]
;edi = data[]
;ebp = @address

	mov	eax, [ebx + 124]		; eax = qlp_coeff[31]
	imul	dword [edi - 128]		; edx:eax = qlp_coeff[31] * data[i-32]
	add	ecx, eax
	adc	esi, edx			; sum += qlp_coeff[31] * data[i-32]

	mov	eax, [ebx + 120]		; eax = qlp_coeff[30]
	imul	dword [edi - 124]		; edx:eax = qlp_coeff[30] * data[i-31]
	add	ecx, eax
	adc	esi, edx			; sum += qlp_coeff[30] * data[i-31]

	; the same 10-byte tap pattern repeats for qlp_coeff[29] .. qlp_coeff[1]

	mov	eax, [ebx + 116]
	imul	dword [edi - 120]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 112]
	imul	dword [edi - 116]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 108]
	imul	dword [edi - 112]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 104]
	imul	dword [edi - 108]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 100]
	imul	dword [edi - 104]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 96]
	imul	dword [edi - 100]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 92]
	imul	dword [edi - 96]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 88]
	imul	dword [edi - 92]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 84]
	imul	dword [edi - 88]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 80]
	imul	dword [edi - 84]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 76]
	imul	dword [edi - 80]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 72]
	imul	dword [edi - 76]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 68]
	imul	dword [edi - 72]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 64]
	imul	dword [edi - 68]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 60]
	imul	dword [edi - 64]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 56]
	imul	dword [edi - 60]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 52]
	imul	dword [edi - 56]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 48]
	imul	dword [edi - 52]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 44]
	imul	dword [edi - 48]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 40]
	imul	dword [edi - 44]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 36]
	imul	dword [edi - 40]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 32]
	imul	dword [edi - 36]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 28]
	imul	dword [edi - 32]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 24]
	imul	dword [edi - 28]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 20]
	imul	dword [edi - 24]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 16]
	imul	dword [edi - 20]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 12]
	imul	dword [edi - 16]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 8]
	imul	dword [edi - 12]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 4]
	imul	dword [edi - 8]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx]			; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul	dword [edi - 4]			; edx:eax = qlp_coeff[ 0] * data[i- 1]
	add	ecx, eax
	adc	esi, edx			; sum += qlp_coeff[ 0] * data[i- 1]

.jumper_0:
	mov	edx, ecx
;esi:edx = sum
	mov	ecx, [esp + 36]			; cl = lp_quantization
	shrd	edx, esi, cl			; edx = (sum >> lp_quantization)
;eax = --
;ecx = --
;edx = sum >> lp_q
;esi = --
;
	mov	eax, [esp + 20]			; residual[] - data[]
	add	edx, [edi + eax]		; edx = residual[i] + (sum >> lp_quantization)
	mov	[edi], edx			; data[i] = residual[i] + (sum >> lp_quantization)
	add	edi, 4

	dec	dword [esp + 24]		; data_len--
	jz	short .end
	xor	ecx, ecx			; sum = 0; re-enter the unrolled chain
	xor	esi, esi
	jmp	ebp

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
2048
2049; end