ce188d4d |
1 | ; vim:filetype=nasm ts=8 |
2 | |
3 | ; libFLAC - Free Lossless Audio Codec library |
4 | ; Copyright (C) 2001-2009 Josh Coalson |
5 | ; Copyright (C) 2011-2016 Xiph.Org Foundation |
6 | ; |
7 | ; Redistribution and use in source and binary forms, with or without |
8 | ; modification, are permitted provided that the following conditions |
9 | ; are met: |
10 | ; |
11 | ; - Redistributions of source code must retain the above copyright |
12 | ; notice, this list of conditions and the following disclaimer. |
13 | ; |
14 | ; - Redistributions in binary form must reproduce the above copyright |
15 | ; notice, this list of conditions and the following disclaimer in the |
16 | ; documentation and/or other materials provided with the distribution. |
17 | ; |
18 | ; - Neither the name of the Xiph.org Foundation nor the names of its |
19 | ; contributors may be used to endorse or promote products derived from |
20 | ; this software without specific prior written permission. |
21 | ; |
22 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
23 | ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
24 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
25 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
26 | ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
27 | ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
28 | ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
29 | ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
30 | ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
31 | ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
32 | ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
33 | |
34 | %include "nasm.h" |
35 | |
36 | data_section |
37 | |
38 | cglobal FLAC__lpc_compute_autocorrelation_asm_ia32 |
39 | cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old |
40 | cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old |
41 | cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old |
42 | cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old |
43 | cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 |
44 | cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx |
45 | cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 |
46 | cglobal FLAC__lpc_restore_signal_asm_ia32 |
47 | cglobal FLAC__lpc_restore_signal_asm_ia32_mmx |
48 | cglobal FLAC__lpc_restore_signal_wide_asm_ia32 |
49 | |
50 | code_section |
51 | |
52 | ; ********************************************************************** |
53 | ; |
54 | ; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) |
55 | ; { |
56 | ; FLAC__real d; |
57 | ; unsigned sample, coeff; |
58 | ; const unsigned limit = data_len - lag; |
59 | ; |
60 | ; FLAC__ASSERT(lag > 0); |
61 | ; FLAC__ASSERT(lag <= data_len); |
62 | ; |
63 | ; for(coeff = 0; coeff < lag; coeff++) |
64 | ; autoc[coeff] = 0.0; |
65 | ; for(sample = 0; sample <= limit; sample++) { |
66 | ; d = data[sample]; |
67 | ; for(coeff = 0; coeff < lag; coeff++) |
68 | ; autoc[coeff] += d * data[sample+coeff]; |
69 | ; } |
70 | ; for(; sample < data_len; sample++) { |
71 | ; d = data[sample]; |
72 | ; for(coeff = 0; coeff < data_len - sample; coeff++) |
73 | ; autoc[coeff] += d * data[sample+coeff]; |
74 | ; } |
75 | ; } |
76 | ; |
;-----------------------------------------------------------------------
; void FLAC__lpc_compute_autocorrelation_asm_ia32(
;	const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
;
; cdecl, IA-32.  Implements the C reference above: zeroes autoc[0..lag-1],
; then accumulates autoc[coeff] += data[sample] * data[sample+coeff],
; tapering the coefficient count near the end of the buffer.
; Clobbers: eax, ecx, edx, flags, x87 stack (left empty on return).
; esi/edi/ebx are saved and restored; no locals beyond the 3 pushes.
;
; NOTE(review): both unrolled inner loops are entered via a computed jump
; (jmp edx) into the middle of the unrolled body, and the jump-target
; arithmetic assumes every unrolled iteration encodes to EXACTLY 11 bytes
; (-11*eax, via lea edx,[eax+eax*2] / neg / lea edx,[eax+edx*4+...]).
; The inc/sub "compensate" fixups below account for the iterations whose
; encoding differs (no displacement byte at offset 0, dword displacement
; at 32*4).  Do not change any instruction in the unrolled regions
; without re-verifying the encodings.
;-----------------------------------------------------------------------
ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32
	;[esp + 28] == autoc[]
	;[esp + 24] == lag
	;[esp + 20] == data_len
	;[esp + 16] == data[]
	; (stack offsets are as seen after the three pushes in .begin)

	;ASSERT(lag > 0)
	;ASSERT(lag <= 33)
	;ASSERT(lag <= data_len)

.begin:
	push esi
	push edi
	push ebx

	; for(coeff = 0; coeff < lag; coeff++)
	;	autoc[coeff] = 0.0;
	mov edi, [esp + 28]		; edi == autoc
	mov ecx, [esp + 24]		; ecx = # of dwords (=lag) of 0 to write
	xor eax, eax			; eax = 0 (the fill value)
	rep stosd			; zero autoc[0..lag-1]

	; const unsigned limit = data_len - lag;
	mov eax, [esp + 24]		; eax == lag
	mov ecx, [esp + 20]
	sub ecx, eax			; ecx == limit

	mov edi, [esp + 28]		; edi == autoc
	mov esi, [esp + 16]		; esi == data
	inc ecx				; we are looping <= limit so we add one to the counter

	; for(sample = 0; sample <= limit; sample++) {
	;	d = data[sample];
	;	for(coeff = 0; coeff < lag; coeff++)
	;		autoc[coeff] += d * data[sample+coeff];
	; }
	fld dword [esi]			; ST = d <- data[sample]
	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
	lea edx, [eax + eax*2]
	neg edx
	lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
	call .mov_eip_to_ebx		; PIC-safe way to read eip (no get_pc_thunk here)
.get_eip1:
	add edx, ebx			; edx = absolute address of the computed jump target
	inc edx				; compensate for the shorter opcode on the last iteration
	inc edx				; compensate for the shorter opcode on the last iteration
	inc edx				; compensate for the shorter opcode on the last iteration
	cmp eax, 33
	jne .loop1_start
	sub edx, byte 9			; compensate for the longer opcodes on the first iteration
.loop1_start:
	jmp edx				; enter the unrolled loop at iteration (lag-1)

.mov_eip_to_ebx:
	mov ebx, [esp]			; ebx = return address = eip at .get_eip1/.get_eip2
	ret

	fld st0				; ST = d d
	fmul dword [esi + (32*4)]	; ST = d*data[sample+32] d	WATCHOUT: not a byte displacement here!
	fadd dword [edi + (32*4)]	; ST = autoc[32]+d*data[sample+32] d	WATCHOUT: not a byte displacement here!
	fstp dword [edi + (32*4)]	; autoc[32]+=d*data[sample+32]  ST = d	WATCHOUT: not a byte displacement here!
	fld st0				; ST = d d
	fmul dword [esi + (31*4)]	; ST = d*data[sample+31] d
	fadd dword [edi + (31*4)]	; ST = autoc[31]+d*data[sample+31] d
	fstp dword [edi + (31*4)]	; autoc[31]+=d*data[sample+31]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (30*4)]	; ST = d*data[sample+30] d
	fadd dword [edi + (30*4)]	; ST = autoc[30]+d*data[sample+30] d
	fstp dword [edi + (30*4)]	; autoc[30]+=d*data[sample+30]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (29*4)]	; ST = d*data[sample+29] d
	fadd dword [edi + (29*4)]	; ST = autoc[29]+d*data[sample+29] d
	fstp dword [edi + (29*4)]	; autoc[29]+=d*data[sample+29]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (28*4)]	; ST = d*data[sample+28] d
	fadd dword [edi + (28*4)]	; ST = autoc[28]+d*data[sample+28] d
	fstp dword [edi + (28*4)]	; autoc[28]+=d*data[sample+28]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (27*4)]	; ST = d*data[sample+27] d
	fadd dword [edi + (27*4)]	; ST = autoc[27]+d*data[sample+27] d
	fstp dword [edi + (27*4)]	; autoc[27]+=d*data[sample+27]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (26*4)]	; ST = d*data[sample+26] d
	fadd dword [edi + (26*4)]	; ST = autoc[26]+d*data[sample+26] d
	fstp dword [edi + (26*4)]	; autoc[26]+=d*data[sample+26]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (25*4)]	; ST = d*data[sample+25] d
	fadd dword [edi + (25*4)]	; ST = autoc[25]+d*data[sample+25] d
	fstp dword [edi + (25*4)]	; autoc[25]+=d*data[sample+25]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (24*4)]	; ST = d*data[sample+24] d
	fadd dword [edi + (24*4)]	; ST = autoc[24]+d*data[sample+24] d
	fstp dword [edi + (24*4)]	; autoc[24]+=d*data[sample+24]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (23*4)]	; ST = d*data[sample+23] d
	fadd dword [edi + (23*4)]	; ST = autoc[23]+d*data[sample+23] d
	fstp dword [edi + (23*4)]	; autoc[23]+=d*data[sample+23]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (22*4)]	; ST = d*data[sample+22] d
	fadd dword [edi + (22*4)]	; ST = autoc[22]+d*data[sample+22] d
	fstp dword [edi + (22*4)]	; autoc[22]+=d*data[sample+22]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (21*4)]	; ST = d*data[sample+21] d
	fadd dword [edi + (21*4)]	; ST = autoc[21]+d*data[sample+21] d
	fstp dword [edi + (21*4)]	; autoc[21]+=d*data[sample+21]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (20*4)]	; ST = d*data[sample+20] d
	fadd dword [edi + (20*4)]	; ST = autoc[20]+d*data[sample+20] d
	fstp dword [edi + (20*4)]	; autoc[20]+=d*data[sample+20]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (19*4)]	; ST = d*data[sample+19] d
	fadd dword [edi + (19*4)]	; ST = autoc[19]+d*data[sample+19] d
	fstp dword [edi + (19*4)]	; autoc[19]+=d*data[sample+19]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (18*4)]	; ST = d*data[sample+18] d
	fadd dword [edi + (18*4)]	; ST = autoc[18]+d*data[sample+18] d
	fstp dword [edi + (18*4)]	; autoc[18]+=d*data[sample+18]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (17*4)]	; ST = d*data[sample+17] d
	fadd dword [edi + (17*4)]	; ST = autoc[17]+d*data[sample+17] d
	fstp dword [edi + (17*4)]	; autoc[17]+=d*data[sample+17]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (16*4)]	; ST = d*data[sample+16] d
	fadd dword [edi + (16*4)]	; ST = autoc[16]+d*data[sample+16] d
	fstp dword [edi + (16*4)]	; autoc[16]+=d*data[sample+16]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (15*4)]	; ST = d*data[sample+15] d
	fadd dword [edi + (15*4)]	; ST = autoc[15]+d*data[sample+15] d
	fstp dword [edi + (15*4)]	; autoc[15]+=d*data[sample+15]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (14*4)]	; ST = d*data[sample+14] d
	fadd dword [edi + (14*4)]	; ST = autoc[14]+d*data[sample+14] d
	fstp dword [edi + (14*4)]	; autoc[14]+=d*data[sample+14]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (13*4)]	; ST = d*data[sample+13] d
	fadd dword [edi + (13*4)]	; ST = autoc[13]+d*data[sample+13] d
	fstp dword [edi + (13*4)]	; autoc[13]+=d*data[sample+13]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (12*4)]	; ST = d*data[sample+12] d
	fadd dword [edi + (12*4)]	; ST = autoc[12]+d*data[sample+12] d
	fstp dword [edi + (12*4)]	; autoc[12]+=d*data[sample+12]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (11*4)]	; ST = d*data[sample+11] d
	fadd dword [edi + (11*4)]	; ST = autoc[11]+d*data[sample+11] d
	fstp dword [edi + (11*4)]	; autoc[11]+=d*data[sample+11]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (10*4)]	; ST = d*data[sample+10] d
	fadd dword [edi + (10*4)]	; ST = autoc[10]+d*data[sample+10] d
	fstp dword [edi + (10*4)]	; autoc[10]+=d*data[sample+10]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 9*4)]	; ST = d*data[sample+9] d
	fadd dword [edi + ( 9*4)]	; ST = autoc[9]+d*data[sample+9] d
	fstp dword [edi + ( 9*4)]	; autoc[9]+=d*data[sample+9]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 8*4)]	; ST = d*data[sample+8] d
	fadd dword [edi + ( 8*4)]	; ST = autoc[8]+d*data[sample+8] d
	fstp dword [edi + ( 8*4)]	; autoc[8]+=d*data[sample+8]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 7*4)]	; ST = d*data[sample+7] d
	fadd dword [edi + ( 7*4)]	; ST = autoc[7]+d*data[sample+7] d
	fstp dword [edi + ( 7*4)]	; autoc[7]+=d*data[sample+7]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 6*4)]	; ST = d*data[sample+6] d
	fadd dword [edi + ( 6*4)]	; ST = autoc[6]+d*data[sample+6] d
	fstp dword [edi + ( 6*4)]	; autoc[6]+=d*data[sample+6]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 5*4)]	; ST = d*data[sample+5] d
	fadd dword [edi + ( 5*4)]	; ST = autoc[5]+d*data[sample+5] d
	fstp dword [edi + ( 5*4)]	; autoc[5]+=d*data[sample+5]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 4*4)]	; ST = d*data[sample+4] d
	fadd dword [edi + ( 4*4)]	; ST = autoc[4]+d*data[sample+4] d
	fstp dword [edi + ( 4*4)]	; autoc[4]+=d*data[sample+4]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 3*4)]	; ST = d*data[sample+3] d
	fadd dword [edi + ( 3*4)]	; ST = autoc[3]+d*data[sample+3] d
	fstp dword [edi + ( 3*4)]	; autoc[3]+=d*data[sample+3]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 2*4)]	; ST = d*data[sample+2] d
	fadd dword [edi + ( 2*4)]	; ST = autoc[2]+d*data[sample+2] d
	fstp dword [edi + ( 2*4)]	; autoc[2]+=d*data[sample+2]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 1*4)]	; ST = d*data[sample+1] d
	fadd dword [edi + ( 1*4)]	; ST = autoc[1]+d*data[sample+1] d
	fstp dword [edi + ( 1*4)]	; autoc[1]+=d*data[sample+1]  ST = d
	fld st0				; ST = d d
	fmul dword [esi]		; ST = d*data[sample] d	WATCHOUT: no displacement byte here!
	fadd dword [edi]		; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
	fstp dword [edi]		; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
.jumper1_0:

	fstp st0			; pop d, ST = empty
	add esi, byte 4			; sample++
	dec ecx
	jz .loop1_end
	fld dword [esi]			; ST = d <- data[sample]
	jmp edx				; re-enter the unrolled loop at the same depth
.loop1_end:

	; for(; sample < data_len; sample++) {
	;	d = data[sample];
	;	for(coeff = 0; coeff < data_len - sample; coeff++)
	;		autoc[coeff] += d * data[sample+coeff];
	; }
	mov ecx, [esp + 24]		; ecx <- lag
	dec ecx				; ecx <- lag - 1
	jz near .end			; skip loop if 0 (i.e. lag == 1)

	fld dword [esi]			; ST = d <- data[sample]
	mov eax, ecx			; eax <- lag - 1 == data_len - sample the first time through
	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
	lea edx, [eax + eax*2]
	neg edx
	lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
	call .mov_eip_to_ebx
.get_eip2:
	add edx, ebx
	inc edx				; compensate for the shorter opcode on the last iteration
	inc edx				; compensate for the shorter opcode on the last iteration
	inc edx				; compensate for the shorter opcode on the last iteration
	jmp edx				; enter the unrolled loop at iteration (data_len-sample-1)

	fld st0				; ST = d d
	fmul dword [esi + (31*4)]	; ST = d*data[sample+31] d
	fadd dword [edi + (31*4)]	; ST = autoc[31]+d*data[sample+31] d
	fstp dword [edi + (31*4)]	; autoc[31]+=d*data[sample+31]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (30*4)]	; ST = d*data[sample+30] d
	fadd dword [edi + (30*4)]	; ST = autoc[30]+d*data[sample+30] d
	fstp dword [edi + (30*4)]	; autoc[30]+=d*data[sample+30]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (29*4)]	; ST = d*data[sample+29] d
	fadd dword [edi + (29*4)]	; ST = autoc[29]+d*data[sample+29] d
	fstp dword [edi + (29*4)]	; autoc[29]+=d*data[sample+29]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (28*4)]	; ST = d*data[sample+28] d
	fadd dword [edi + (28*4)]	; ST = autoc[28]+d*data[sample+28] d
	fstp dword [edi + (28*4)]	; autoc[28]+=d*data[sample+28]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (27*4)]	; ST = d*data[sample+27] d
	fadd dword [edi + (27*4)]	; ST = autoc[27]+d*data[sample+27] d
	fstp dword [edi + (27*4)]	; autoc[27]+=d*data[sample+27]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (26*4)]	; ST = d*data[sample+26] d
	fadd dword [edi + (26*4)]	; ST = autoc[26]+d*data[sample+26] d
	fstp dword [edi + (26*4)]	; autoc[26]+=d*data[sample+26]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (25*4)]	; ST = d*data[sample+25] d
	fadd dword [edi + (25*4)]	; ST = autoc[25]+d*data[sample+25] d
	fstp dword [edi + (25*4)]	; autoc[25]+=d*data[sample+25]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (24*4)]	; ST = d*data[sample+24] d
	fadd dword [edi + (24*4)]	; ST = autoc[24]+d*data[sample+24] d
	fstp dword [edi + (24*4)]	; autoc[24]+=d*data[sample+24]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (23*4)]	; ST = d*data[sample+23] d
	fadd dword [edi + (23*4)]	; ST = autoc[23]+d*data[sample+23] d
	fstp dword [edi + (23*4)]	; autoc[23]+=d*data[sample+23]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (22*4)]	; ST = d*data[sample+22] d
	fadd dword [edi + (22*4)]	; ST = autoc[22]+d*data[sample+22] d
	fstp dword [edi + (22*4)]	; autoc[22]+=d*data[sample+22]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (21*4)]	; ST = d*data[sample+21] d
	fadd dword [edi + (21*4)]	; ST = autoc[21]+d*data[sample+21] d
	fstp dword [edi + (21*4)]	; autoc[21]+=d*data[sample+21]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (20*4)]	; ST = d*data[sample+20] d
	fadd dword [edi + (20*4)]	; ST = autoc[20]+d*data[sample+20] d
	fstp dword [edi + (20*4)]	; autoc[20]+=d*data[sample+20]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (19*4)]	; ST = d*data[sample+19] d
	fadd dword [edi + (19*4)]	; ST = autoc[19]+d*data[sample+19] d
	fstp dword [edi + (19*4)]	; autoc[19]+=d*data[sample+19]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (18*4)]	; ST = d*data[sample+18] d
	fadd dword [edi + (18*4)]	; ST = autoc[18]+d*data[sample+18] d
	fstp dword [edi + (18*4)]	; autoc[18]+=d*data[sample+18]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (17*4)]	; ST = d*data[sample+17] d
	fadd dword [edi + (17*4)]	; ST = autoc[17]+d*data[sample+17] d
	fstp dword [edi + (17*4)]	; autoc[17]+=d*data[sample+17]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (16*4)]	; ST = d*data[sample+16] d
	fadd dword [edi + (16*4)]	; ST = autoc[16]+d*data[sample+16] d
	fstp dword [edi + (16*4)]	; autoc[16]+=d*data[sample+16]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (15*4)]	; ST = d*data[sample+15] d
	fadd dword [edi + (15*4)]	; ST = autoc[15]+d*data[sample+15] d
	fstp dword [edi + (15*4)]	; autoc[15]+=d*data[sample+15]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (14*4)]	; ST = d*data[sample+14] d
	fadd dword [edi + (14*4)]	; ST = autoc[14]+d*data[sample+14] d
	fstp dword [edi + (14*4)]	; autoc[14]+=d*data[sample+14]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (13*4)]	; ST = d*data[sample+13] d
	fadd dword [edi + (13*4)]	; ST = autoc[13]+d*data[sample+13] d
	fstp dword [edi + (13*4)]	; autoc[13]+=d*data[sample+13]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (12*4)]	; ST = d*data[sample+12] d
	fadd dword [edi + (12*4)]	; ST = autoc[12]+d*data[sample+12] d
	fstp dword [edi + (12*4)]	; autoc[12]+=d*data[sample+12]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (11*4)]	; ST = d*data[sample+11] d
	fadd dword [edi + (11*4)]	; ST = autoc[11]+d*data[sample+11] d
	fstp dword [edi + (11*4)]	; autoc[11]+=d*data[sample+11]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (10*4)]	; ST = d*data[sample+10] d
	fadd dword [edi + (10*4)]	; ST = autoc[10]+d*data[sample+10] d
	fstp dword [edi + (10*4)]	; autoc[10]+=d*data[sample+10]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 9*4)]	; ST = d*data[sample+9] d
	fadd dword [edi + ( 9*4)]	; ST = autoc[9]+d*data[sample+9] d
	fstp dword [edi + ( 9*4)]	; autoc[9]+=d*data[sample+9]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 8*4)]	; ST = d*data[sample+8] d
	fadd dword [edi + ( 8*4)]	; ST = autoc[8]+d*data[sample+8] d
	fstp dword [edi + ( 8*4)]	; autoc[8]+=d*data[sample+8]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 7*4)]	; ST = d*data[sample+7] d
	fadd dword [edi + ( 7*4)]	; ST = autoc[7]+d*data[sample+7] d
	fstp dword [edi + ( 7*4)]	; autoc[7]+=d*data[sample+7]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 6*4)]	; ST = d*data[sample+6] d
	fadd dword [edi + ( 6*4)]	; ST = autoc[6]+d*data[sample+6] d
	fstp dword [edi + ( 6*4)]	; autoc[6]+=d*data[sample+6]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 5*4)]	; ST = d*data[sample+5] d
	fadd dword [edi + ( 5*4)]	; ST = autoc[5]+d*data[sample+5] d
	fstp dword [edi + ( 5*4)]	; autoc[5]+=d*data[sample+5]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 4*4)]	; ST = d*data[sample+4] d
	fadd dword [edi + ( 4*4)]	; ST = autoc[4]+d*data[sample+4] d
	fstp dword [edi + ( 4*4)]	; autoc[4]+=d*data[sample+4]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 3*4)]	; ST = d*data[sample+3] d
	fadd dword [edi + ( 3*4)]	; ST = autoc[3]+d*data[sample+3] d
	fstp dword [edi + ( 3*4)]	; autoc[3]+=d*data[sample+3]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 2*4)]	; ST = d*data[sample+2] d
	fadd dword [edi + ( 2*4)]	; ST = autoc[2]+d*data[sample+2] d
	fstp dword [edi + ( 2*4)]	; autoc[2]+=d*data[sample+2]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 1*4)]	; ST = d*data[sample+1] d
	fadd dword [edi + ( 1*4)]	; ST = autoc[1]+d*data[sample+1] d
	fstp dword [edi + ( 1*4)]	; autoc[1]+=d*data[sample+1]  ST = d
	fld st0				; ST = d d
	fmul dword [esi]		; ST = d*data[sample] d	WATCHOUT: no displacement byte here!
	fadd dword [edi]		; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
	fstp dword [edi]		; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
.jumper2_0:

	fstp st0			; pop d, ST = empty
	add esi, byte 4			; sample++
	dec ecx
	jz .loop2_end
	add edx, byte 11		; adjust our inner loop counter by adjusting the jump target
	fld dword [esi]			; ST = d <- data[sample]
	jmp edx
.loop2_end:

.end:
	pop ebx
	pop edi
	pop esi
	ret
444 | |
;-----------------------------------------------------------------------
; SSE variant for lag <= 4.  Keeps a rolling 4-float history window in
; xmm2 and four running sums in xmm5; the `lag` argument is only
; asserted, not read, and 4 floats are always stored to autoc[]
; (movups, so autoc need not be aligned).
; Clobbers: eax, edx, xmm0, xmm2, xmm5, flags.  No callee-saved regs used.
;-----------------------------------------------------------------------
ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]

	;ASSERT(lag > 0)
	;ASSERT(lag <= 4)
	;ASSERT(lag <= data_len)

	; for(coeff = 0; coeff < lag; coeff++)
	;	autoc[coeff] = 0.0;
	xorps xmm5, xmm5		; xmm5 = accumulators autoc[3..0] = 0

	mov edx, [esp + 8]		; edx == data_len
	mov eax, [esp + 4]		; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]		; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0		; xmm2 = 0,0,0,data[0]  (history window; zeros act as "no sample yet")
	shufps xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
.warmup:				; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
	mulps xmm0, xmm2		; xmm0 = xmm0 * xmm2
	addps xmm5, xmm0		; xmm5 += xmm0 * xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[sample]
	add eax, 4
	shufps xmm0, xmm0, 0		; xmm0 = data[sample],data[sample],data[sample],data[sample]
	shufps xmm2, xmm2, 93h		; 93h=2-1-0-3 => xmm2 gets rotated left by one float
	movss xmm2, xmm0		; insert the new sample at the low lane of the window
	mulps xmm0, xmm2		; xmm0 = xmm0 * xmm2
	addps xmm5, xmm0		; xmm5 += xmm0 * xmm2
	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]		; edx == autoc
	movups [edx], xmm5		; unaligned store of autoc[0..3]

.end:
	ret
491 | |
;-----------------------------------------------------------------------
; SSE variant for lag <= 8.  Rolling 8-float history in xmm3:xmm2,
; running sums in xmm6:xmm5; `lag` is only asserted, and 8 floats are
; always stored to autoc[] (unaligned movups).
; Clobbers: eax, edx, xmm0-xmm3, xmm5, xmm6, flags.
; NOTE(review): the loop body is deliberately hand-scheduled (see the
; (#) indexes); keep the instruction order when editing.
;-----------------------------------------------------------------------
ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]

	;ASSERT(lag > 0)
	;ASSERT(lag <= 8)
	;ASSERT(lag <= data_len)

	; for(coeff = 0; coeff < lag; coeff++)
	;	autoc[coeff] = 0.0;
	xorps xmm5, xmm5		; xmm5 = accumulators autoc[3..0] = 0
	xorps xmm6, xmm6		; xmm6 = accumulators autoc[7..4] = 0

	mov edx, [esp + 8]		; edx == data_len
	mov eax, [esp + 4]		; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]		; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0		; xmm2 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	movaps xmm1, xmm0		; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps xmm3, xmm3		; xmm3 = 0,0,0,0
.warmup:				; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
	mulps xmm0, xmm2
	mulps xmm1, xmm3		; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps xmm5, xmm0
	addps xmm6, xmm1		; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[sample]
	; here we reorder the instructions; see the (#) indexes for a logical order
	shufps xmm2, xmm2, 93h		; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
	add eax, 4			; (0)
	shufps xmm3, xmm3, 93h		; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps xmm0, xmm0, 0		; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
	movss xmm3, xmm2		; (5) carry the oldest xmm2 lane into xmm3 (8-float shift)
	movaps xmm1, xmm0		; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
	movss xmm2, xmm0		; (6) insert the new sample at the low lane
	mulps xmm1, xmm3		; (8)
	mulps xmm0, xmm2		; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps xmm6, xmm1		; (10)
	addps xmm5, xmm0		; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]		; edx == autoc
	movups [edx], xmm5		; autoc[0..3]
	movups [edx + 16], xmm6		; autoc[4..7]

.end:
	ret
550 | |
;-----------------------------------------------------------------------
; SSE variant for lag <= 12.  Rolling 12-float history in
; xmm4:xmm3:xmm2, running sums in xmm7:xmm6:xmm5; `lag` is only
; asserted, and 12 floats are always stored to autoc[] (movups).
; Clobbers: eax, edx, xmm0-xmm7, flags.
;-----------------------------------------------------------------------
ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]

	;ASSERT(lag > 0)
	;ASSERT(lag <= 12)
	;ASSERT(lag <= data_len)

	; for(coeff = 0; coeff < lag; coeff++)
	;	autoc[coeff] = 0.0;
	xorps xmm5, xmm5		; accumulators autoc[3..0]
	xorps xmm6, xmm6		; accumulators autoc[7..4]
	xorps xmm7, xmm7		; accumulators autoc[11..8]

	mov edx, [esp + 8]		; edx == data_len
	mov eax, [esp + 4]		; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]		; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0		; xmm2 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps xmm3, xmm3		; xmm3 = 0,0,0,0
	xorps xmm4, xmm4		; xmm4 = 0,0,0,0
.warmup:				; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
	movaps xmm1, xmm0
	mulps xmm1, xmm2
	addps xmm5, xmm1
	movaps xmm1, xmm0
	mulps xmm1, xmm3
	addps xmm6, xmm1
	mulps xmm0, xmm4
	addps xmm7, xmm0		; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[sample]
	add eax, 4
	shufps xmm0, xmm0, 0		; xmm0 = data[sample],data[sample],data[sample],data[sample]

	; shift xmm4:xmm3:xmm2 left by one float
	shufps xmm2, xmm2, 93h		; 93h=2-1-0-3 => xmm2 gets rotated left by one float
	shufps xmm3, xmm3, 93h		; 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps xmm4, xmm4, 93h		; 93h=2-1-0-3 => xmm4 gets rotated left by one float
	movss xmm4, xmm3		; carry oldest xmm3 lane into xmm4
	movss xmm3, xmm2		; carry oldest xmm2 lane into xmm3
	movss xmm2, xmm0		; insert the new sample at the low lane

	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	movaps xmm1, xmm0
	mulps xmm1, xmm2
	addps xmm5, xmm1
	movaps xmm1, xmm0
	mulps xmm1, xmm3
	addps xmm6, xmm1
	mulps xmm0, xmm4
	addps xmm7, xmm0

	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]		; edx == autoc
	movups [edx], xmm5		; autoc[0..3]
	movups [edx + 16], xmm6		; autoc[4..7]
	movups [edx + 32], xmm7		; autoc[8..11]

.end:
	ret
624 | |
;-----------------------------------------------------------------------
; SSE variant for lag <= 16.  Rolling 16-float history in
; xmm4:xmm3:xmm2:xmm1; running sums for autoc[0..7] live in xmm6:xmm5
; and, since we are out of registers, the sums for autoc[8..15] live in
; two 16-byte-aligned stack slots at [esp] and [esp+16] (hence the
; ebp frame and the `and esp, -16` realignment: movaps/addps on those
; slots require alignment).  `lag` is only asserted; 16 floats are
; always stored to autoc[] (movups).
; Clobbers: eax, edx, xmm0-xmm7, flags; ebp is saved/restored.
;-----------------------------------------------------------------------
ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
	;[ebp + 20] == autoc[]
	;[ebp + 16] == lag
	;[ebp + 12] == data_len
	;[ebp + 8] == data[]
	;[esp] == __m128 (accumulators autoc[11..8])
	;[esp + 16] == __m128 (accumulators autoc[15..12])

	push ebp
	mov ebp, esp
	and esp, -16			; stack realign for SSE instructions 'movaps' and 'addps'
	sub esp, 32			; two aligned __m128 spill slots

	;ASSERT(lag > 0)
	;ASSERT(lag <= 16)
	;ASSERT(lag <= data_len)
	;ASSERT(data_len > 0)

	; for(coeff = 0; coeff < lag; coeff++)
	;	autoc[coeff] = 0.0;
	xorps xmm5, xmm5
	xorps xmm6, xmm6
	movaps [esp], xmm5		; zero the autoc[8..11] accumulators
	movaps [esp + 16], xmm6		; zero the autoc[12..15] accumulators

	mov edx, [ebp + 12]		; edx == data_len
	mov eax, [ebp + 8]		; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]		; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm1, xmm0		; xmm1 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps xmm2, xmm2		; xmm2 = 0,0,0,0
	xorps xmm3, xmm3		; xmm3 = 0,0,0,0
	xorps xmm4, xmm4		; xmm4 = 0,0,0,0
	movaps xmm7, xmm0		; warm-up: only the low window word is live yet,
	mulps xmm7, xmm1		; so only the autoc[0..3] sums can be nonzero
	addps xmm5, xmm7
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[sample]
	add eax, 4
	shufps xmm0, xmm0, 0		; xmm0 = data[sample],data[sample],data[sample],data[sample]

	; shift xmm4:xmm3:xmm2:xmm1 left by one float
	shufps xmm1, xmm1, 93h
	shufps xmm2, xmm2, 93h
	shufps xmm3, xmm3, 93h
	shufps xmm4, xmm4, 93h
	movss xmm4, xmm3		; carry the oldest lane down the chain...
	movss xmm3, xmm2
	movss xmm2, xmm1
	movss xmm1, xmm0		; ...and insert the new sample at the low lane

	; [esp+16]:[esp]:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1
	movaps xmm7, xmm0
	mulps xmm7, xmm1
	addps xmm5, xmm7
	movaps xmm7, xmm0
	mulps xmm7, xmm2
	addps xmm6, xmm7
	movaps xmm7, xmm0
	mulps xmm7, xmm3
	mulps xmm0, xmm4
	addps xmm7, [esp]		; accumulate autoc[8..11] in the aligned spill slot
	addps xmm0, [esp + 16]		; accumulate autoc[12..15] in the aligned spill slot
	movaps [esp], xmm7
	movaps [esp + 16], xmm0

	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [ebp + 20]		; edx == autoc
	movups [edx], xmm5		; autoc[0..3]
	movups [edx + 16], xmm6		; autoc[4..7]
	movaps xmm5, [esp]
	movaps xmm6, [esp + 16]
	movups [edx + 32], xmm5		; autoc[8..11]
	movups [edx + 48], xmm6		; autoc[12..15]
.end:
	mov esp, ebp
	pop ebp
	ret
713 | |
714 | ;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) |
715 | ; |
716 | ; for(i = 0; i < data_len; i++) { |
717 | ; sum = 0; |
718 | ; for(j = 0; j < order; j++) |
719 | ; sum += qlp_coeff[j] * data[i-j-1]; |
720 | ; residual[i] = data[i] - (sum >> lp_quantization); |
721 | ; } |
722 | ; |
;-----------------------------------------------------------------------
; void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32(
;         const FLAC__int32 *data, unsigned data_len,
;         const FLAC__int32 qlp_coeff[], unsigned order,
;         int lp_quantization, FLAC__int32 residual[])
;
; cdecl.  Implements the C reference above: for each sample,
;   residual[i] = data[i] - ((SUM j qlp_coeff[j]*data[i-j-1]) >> lp_quantization)
; with 32-bit accumulation.  Three code paths:
;   order == 1        - product kept entirely in registers
;   1 < order <= 32   - computed jump into a fully unrolled MAC chain
;   order > 32        - generic nested loop (completeness only, since
;                       FLAC__MAX_LPC_ORDER == 32)
;-----------------------------------------------------------------------
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
	; argument offsets as seen AFTER the four pushes below
	; (4 saved registers + return address = 20 bytes):
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]		; esi = data[]
	mov	edi, [esp + 40]		; edi = residual[]
	mov	eax, [esp + 32]		; eax = order
	mov	ebx, [esp + 24]		; ebx = data_len

	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0
.begin:
	cmp	eax, byte 1
	jg	short .i_1more

	; ---- order == 1: single product, no memory traffic for coeff/history
	mov	ecx, [esp + 28]
	mov	edx, [ecx]		; edx = qlp_coeff[0]
	mov	eax, [esi - 4]		; eax = data[-1]
	mov	ecx, [esp + 36]		; cl = lp_quantization
	ALIGN	16
.i_1_loop_i:
	imul	eax, edx		; eax = qlp_coeff[0] * data[i-1]
	sar	eax, cl			; eax = sum >> lp_quantization
	neg	eax
	add	eax, [esi]		; eax = data[i] - (sum >> lp_quantization)
	mov	[edi], eax		; residual[i] = eax
	mov	eax, [esi]		; eax = data[i], next iteration's data[i-1]
	add	edi, byte 4
	add	esi, byte 4
	dec	ebx
	jnz	.i_1_loop_i

	jmp	.end

.i_1more:
	cmp	eax, byte 32		; for order <= 32 there is a faster routine
	jbe	short .i_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.i_32more_loop_i:
	; ---- order > 32: generic two-level loop; ebp = sum
	xor	ebp, ebp
	mov	ecx, [esp + 32]		; ecx = order
	mov	edx, ecx
	shl	edx, 2
	add	edx, [esp + 28]		; edx = &qlp_coeff[order]
	neg	ecx			; ecx = -order; counts up toward zero
	ALIGN	16
.i_32more_loop_j:
	sub	edx, byte 4		; walk qlp_coeff[] from the top down
	mov	eax, [edx]
	imul	eax, [esi + 4 * ecx]	; eax = qlp_coeff[j] * data[i-j-1]
	add	ebp, eax		; sum += qlp_coeff[j] * data[i-j-1]
	inc	ecx
	jnz	short .i_32more_loop_j

	mov	ecx, [esp + 36]		; cl = lp_quantization
	sar	ebp, cl			; ebp = sum >> lp_quantization
	neg	ebp
	add	ebp, [esi]		; ebp = data[i] - (sum >> lp_quantization)
	mov	[edi], ebp		; residual[i] = ebp
	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.i_32more_loop_i

	jmp	.end

.mov_eip_to_eax:
	; helper: loads the caller's return address (EIP following the
	; call) into eax, making the computed jump below position-independent
	mov	eax, [esp]
	ret

.i_32:
	; ---- 1 < order <= 32: jump into the middle of the unrolled MAC
	; chain below so that exactly `order` mov/imul/add triplets execute.
	; Each triplet is 9 bytes, except the final one which is 8 (the
	; `mov ecx, [eax]` form has no displacement byte), so the entry
	; point is .jumper_0 - 9*order + 1.
	sub	edi, esi		; edi = residual - data; residual[i] is then [edi + esi], so only esi advances
	neg	eax			; eax = -order
	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]	; edx = -9*order + (.jumper_0 - .get_eip0)
	call	.mov_eip_to_eax		; eax = address of .get_eip0
.get_eip0:
	add	edx, eax		; edx = .jumper_0 - 9*order
	inc	edx			; compensate for the shorter opcode on the last iteration
	mov	eax, [esp + 28]	; eax = qlp_coeff[]
	xor	ebp, ebp		; ebp = sum = 0
	jmp	edx

	mov	ecx, [eax + 124]	; ecx = qlp_coeff[31]
	imul	ecx, [esi - 128]	; ecx = qlp_coeff[31] * data[i-32]
	add	ebp, ecx		; sum += qlp_coeff[31] * data[i-32]
	mov	ecx, [eax + 120]
	imul	ecx, [esi - 124]
	add	ebp, ecx
	mov	ecx, [eax + 116]
	imul	ecx, [esi - 120]
	add	ebp, ecx
	mov	ecx, [eax + 112]
	imul	ecx, [esi - 116]
	add	ebp, ecx
	mov	ecx, [eax + 108]
	imul	ecx, [esi - 112]
	add	ebp, ecx
	mov	ecx, [eax + 104]
	imul	ecx, [esi - 108]
	add	ebp, ecx
	mov	ecx, [eax + 100]
	imul	ecx, [esi - 104]
	add	ebp, ecx
	mov	ecx, [eax + 96]
	imul	ecx, [esi - 100]
	add	ebp, ecx
	mov	ecx, [eax + 92]
	imul	ecx, [esi - 96]
	add	ebp, ecx
	mov	ecx, [eax + 88]
	imul	ecx, [esi - 92]
	add	ebp, ecx
	mov	ecx, [eax + 84]
	imul	ecx, [esi - 88]
	add	ebp, ecx
	mov	ecx, [eax + 80]
	imul	ecx, [esi - 84]
	add	ebp, ecx
	mov	ecx, [eax + 76]
	imul	ecx, [esi - 80]
	add	ebp, ecx
	mov	ecx, [eax + 72]
	imul	ecx, [esi - 76]
	add	ebp, ecx
	mov	ecx, [eax + 68]
	imul	ecx, [esi - 72]
	add	ebp, ecx
	mov	ecx, [eax + 64]
	imul	ecx, [esi - 68]
	add	ebp, ecx
	mov	ecx, [eax + 60]
	imul	ecx, [esi - 64]
	add	ebp, ecx
	mov	ecx, [eax + 56]
	imul	ecx, [esi - 60]
	add	ebp, ecx
	mov	ecx, [eax + 52]
	imul	ecx, [esi - 56]
	add	ebp, ecx
	mov	ecx, [eax + 48]
	imul	ecx, [esi - 52]
	add	ebp, ecx
	mov	ecx, [eax + 44]
	imul	ecx, [esi - 48]
	add	ebp, ecx
	mov	ecx, [eax + 40]
	imul	ecx, [esi - 44]
	add	ebp, ecx
	mov	ecx, [eax + 36]
	imul	ecx, [esi - 40]
	add	ebp, ecx
	mov	ecx, [eax + 32]
	imul	ecx, [esi - 36]
	add	ebp, ecx
	mov	ecx, [eax + 28]
	imul	ecx, [esi - 32]
	add	ebp, ecx
	mov	ecx, [eax + 24]
	imul	ecx, [esi - 28]
	add	ebp, ecx
	mov	ecx, [eax + 20]
	imul	ecx, [esi - 24]
	add	ebp, ecx
	mov	ecx, [eax + 16]
	imul	ecx, [esi - 20]
	add	ebp, ecx
	mov	ecx, [eax + 12]
	imul	ecx, [esi - 16]
	add	ebp, ecx
	mov	ecx, [eax + 8]
	imul	ecx, [esi - 12]
	add	ebp, ecx
	mov	ecx, [eax + 4]
	imul	ecx, [esi - 8]
	add	ebp, ecx
	mov	ecx, [eax]		; ecx = qlp_coeff[0] (NOTE: encoding is one byte shorter - no displacement byte)
	imul	ecx, [esi - 4]		; ecx = qlp_coeff[0] * data[i-1]
	add	ebp, ecx		; sum += qlp_coeff[0] * data[i-1]
.jumper_0:

	mov	ecx, [esp + 36]		; cl = lp_quantization
	sar	ebp, cl			; ebp = sum >> lp_quantization
	neg	ebp
	add	ebp, [esi]		; ebp = data[i] - (sum >> lp_quantization)
	mov	[edi + esi], ebp	; residual[i] = ebp  (edi holds residual - data)
	add	esi, byte 4

	dec	ebx
	jz	short .end
	xor	ebp, ebp		; sum = 0, then re-enter the unrolled chain
	jmp	edx

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
936 | |
937 | ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for |
938 | ; the channel and qlp_coeffs must be <= 16. Especially note that this routine |
939 | ; cannot be used for side-channel coded 16bps channels since the effective bps |
940 | ; is 17. |
;-----------------------------------------------------------------------
; MMX variant of FLAC__lpc_compute_residual_from_qlp_coefficients.
; cdecl; same signature and argument layout as the plain ia32 version.
;
; Per the WATCHOUT above: samples and qlp_coeffs must fit in 16 bits
; (the history window keeps only the low 16 bits of each sample, and
; coefficients are truncated to words when copied to the stack below).
; Processes two samples per iteration via pmaddwd; an odd trailing
; sample is handed off to the scalar routine at .last_one.
;-----------------------------------------------------------------------
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
	; argument offsets as seen AFTER the four pushes below:
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]		; esi = data[]
	mov	edi, [esp + 40]		; edi = residual[]
	mov	eax, [esp + 32]		; eax = order
	mov	ebx, [esp + 24]		; ebx = data_len

	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0
	dec	ebx			; ebx = data_len - 1 (loop does 2 samples/iteration)
	test	ebx, ebx
	jz	near .last_one		; data_len == 1: scalar routine handles it

	mov	edx, [esp + 28]		; edx = qlp_coeff[]
	movd	mm6, [esp + 36]		; mm6 = 0:lp_quantization (psrad shift count)
	mov	ebp, esp		; save esp; restored at .mmx_end

	and	esp, 0xfffffff8		; 8-byte-align the stack for movq

	; copy qlp_coeff[] onto the stack as 16-bit words, qlp_coeff[0]
	; pushed first (= highest address); after the loop the word at
	; [esp] is qlp_coeff[order-1]
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]	; truncate coefficient to 16 bits
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	; zero-pad the packed coefficients so the effective order (eax)
	; becomes a multiple of 4
	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax			; eax = order rounded up to a multiple of 4
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = words qlp_coeff[3],[2],[1],[0] (low word = [3])
	; mm4 = sliding history window: words data[-4],data[-3],data[-2],data[-1]
	movd	mm4, [esi - 16]
	punpckldq	mm4, [esi - 12]
	movd	mm0, [esi - 8]
	punpckldq	mm0, [esi - 4]
	packssdw	mm4, mm0

	cmp	eax, byte 4
	jnbe	short .mmx_4more

	; ---- (padded) order == 4: whole sum comes from mm4 x mm5
	ALIGN	16
.mmx_4_loop_i:
	movd	mm1, [esi]
	movq	mm3, mm4		; mm3 = history for sample i
	punpckldq	mm1, [esi + 4]	; mm1 = dwords data[i], data[i+1]
	psrlq	mm4, 16			; slide window: drop oldest word...
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0		; ...append low 16 bits of data[i]
	movq	mm2, mm4		; mm2 = history for sample i+1
	psrlq	mm4, 16			; slide again for sample i+2
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5		; partial products for sample i
	pmaddwd	mm2, mm5		; partial products for sample i+1
	psllq	mm0, 16
	por	mm4, mm0		; append low 16 bits of data[i+1]
	; horizontal add: mm3 = dwords sum_i, sum_{i+1}
	movq	mm0, mm3
	punpckldq	mm3, mm2
	punpckhdq	mm0, mm2
	paddd	mm3, mm0
	psrad	mm3, mm6		; both sums >> lp_quantization
	psubd	mm1, mm3		; residuals = data - shifted sums
	movd	[edi], mm1		; residual[i]
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1		; residual[i+1]

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2			; two samples consumed
	jg	.mmx_4_loop_i
	jmp	.mmx_end

.mmx_4more:
	; ---- padded order > 4: newest 4 taps come from the mm4 window,
	; older taps are read from memory in groups of 4 in the j loop
	shl	eax, 2
	neg	eax
	add	eax, byte 16		; eax = 16 - 4*order (constant, negative)

	ALIGN 16
.mmx_4more_loop_i:
	movd	mm1, [esi]
	punpckldq	mm1, [esi + 4]	; mm1 = dwords data[i], data[i+1]
	movq	mm3, mm4		; mm3 = newest-4 history for sample i
	psrlq	mm4, 16			; slide window (as in the order==4 path)
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0
	movq	mm2, mm4		; mm2 = newest-4 history for sample i+1
	psrlq	mm4, 16
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5		; accumulators seeded with the newest 4 taps
	pmaddwd	mm2, mm5
	psllq	mm0, 16
	por	mm4, mm0

	mov	ecx, esi
	add	ecx, eax		; ecx = &data[i] + 16 - 4*order, i.e. the oldest taps
	mov	edx, esp		; edx = packed coeffs, starting at qlp_coeff[order-1]

	ALIGN	16
.mmx_4more_loop_j:
	; accumulate 4 older taps for sample i into mm3 ...
	movd	mm0, [ecx - 16]
	movd	mm7, [ecx - 8]
	punpckldq	mm0, [ecx - 12]
	punpckldq	mm7, [ecx - 4]
	packssdw	mm0, mm7
	pmaddwd	mm0, [edx]
	punpckhdq	mm7, mm7
	paddd	mm3, mm0
	; ... and the same taps, offset by one sample, into mm2
	movd	mm0, [ecx - 12]
	punpckldq	mm0, [ecx - 8]
	punpckldq	mm7, [ecx]
	packssdw	mm0, mm7
	pmaddwd	mm0, [edx]
	paddd	mm2, mm0

	add	edx, byte 8		; next 4 packed coefficients
	add	ecx, byte 16		; next 4 samples
	cmp	ecx, esi		; stop when we reach the newest-4 region
	jnz	.mmx_4more_loop_j

	; horizontal add and store, as in the order==4 path
	movq	mm0, mm3
	punpckldq	mm3, mm2
	punpckhdq	mm0, mm2
	paddd	mm3, mm0
	psrad	mm3, mm6
	psubd	mm1, mm3
	movd	[edi], mm1
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2
	jg	near .mmx_4more_loop_i

.mmx_end:
	emms
	mov	esp, ebp		; undo the alignment/coefficient frame
.last_one:
	; ebx is 0 or -1 here (or 0 when jumped to directly); after inc,
	; ebx == 1 means one sample remains: process it with the scalar
	; routine (esi/edi already point at it, eax = order, ebx = 1)
	mov	eax, [esp + 32]
	inc	ebx
	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1115 | |
1116 | ; ********************************************************************** |
1117 | ; |
1118 | ; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) |
1119 | ; { |
1120 | ; unsigned i, j; |
1121 | ; FLAC__int32 sum; |
1122 | ; |
1123 | ; FLAC__ASSERT(order > 0); |
1124 | ; |
1125 | ; for(i = 0; i < data_len; i++) { |
1126 | ; sum = 0; |
1127 | ; for(j = 0; j < order; j++) |
1128 | ; sum += qlp_coeff[j] * data[i-j-1]; |
1129 | ; data[i] = residual[i] + (sum >> lp_quantization); |
1130 | ; } |
1131 | ; } |
;-----------------------------------------------------------------------
; void FLAC__lpc_restore_signal_asm_ia32(
;         const FLAC__int32 residual[], unsigned data_len,
;         const FLAC__int32 qlp_coeff[], unsigned order,
;         int lp_quantization, FLAC__int32 data[])
;
; cdecl.  Inverse of the residual computation (see C reference above):
;   data[i] = residual[i] + ((SUM j qlp_coeff[j]*data[i-j-1]) >> lp_quantization)
; Same three-path structure as the residual routine: order == 1,
; 1 < order <= 32 (computed jump into unrolled code), order > 32.
;-----------------------------------------------------------------------
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32
	; argument offsets as seen AFTER the four pushes below:
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]		; esi = residual[]
	mov	edi, [esp + 40]		; edi = data[]
	mov	eax, [esp + 32]		; eax = order
	mov	ebx, [esp + 24]		; ebx = data_len

	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0

.begin:
	cmp	eax, byte 1
	jg	short .x87_1more

	; ---- order == 1
	mov	ecx, [esp + 28]
	mov	edx, [ecx]		; edx = qlp_coeff[0]
	mov	eax, [edi - 4]		; eax = data[-1]
	mov	ecx, [esp + 36]		; cl = lp_quantization
	ALIGN	16
.x87_1_loop_i:
	imul	eax, edx		; eax = qlp_coeff[0] * data[i-1]
	sar	eax, cl			; eax = sum >> lp_quantization
	add	eax, [esi]		; eax = residual[i] + (sum >> lp_quantization)
	mov	[edi], eax		; data[i] = eax; eax is also next iteration's data[i-1]
	add	esi, byte 4
	add	edi, byte 4
	dec	ebx
	jnz	.x87_1_loop_i

	jmp	.end

.x87_1more:
	cmp	eax, byte 32		; for order <= 32 there is a faster routine
	jbe	short .x87_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.x87_32more_loop_i:
	; ---- order > 32: generic two-level loop; ebp = sum
	xor	ebp, ebp
	mov	ecx, [esp + 32]		; ecx = order
	mov	edx, ecx
	shl	edx, 2
	add	edx, [esp + 28]		; edx = &qlp_coeff[order]
	neg	ecx			; ecx = -order; counts up toward zero
	ALIGN	16
.x87_32more_loop_j:
	sub	edx, byte 4		; walk qlp_coeff[] from the top down
	mov	eax, [edx]
	imul	eax, [edi + 4 * ecx]	; eax = qlp_coeff[j] * data[i-j-1]
	add	ebp, eax		; sum += qlp_coeff[j] * data[i-j-1]
	inc	ecx
	jnz	short .x87_32more_loop_j

	mov	ecx, [esp + 36]		; cl = lp_quantization
	sar	ebp, cl			; ebp = sum >> lp_quantization
	add	ebp, [esi]		; ebp = residual[i] + (sum >> lp_quantization)
	mov	[edi], ebp		; data[i] = ebp
	add	edi, byte 4
	add	esi, byte 4

	dec	ebx
	jnz	.x87_32more_loop_i

	jmp	.end

.mov_eip_to_eax:
	; helper: loads the caller's return address (EIP following the
	; call) into eax, making the computed jump below position-independent
	mov	eax, [esp]
	ret

.x87_32:
	; ---- 1 < order <= 32: jump into the unrolled MAC chain so that
	; exactly `order` 9-byte triplets execute (last one is 8 bytes,
	; hence the `inc edx` below); entry = .jumper_0 - 9*order + 1
	sub	esi, edi		; esi = residual - data; residual[i] is then [esi + edi], so only edi advances
	neg	eax			; eax = -order
	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]	; edx = -9*order + (.jumper_0 - .get_eip0)
	call	.mov_eip_to_eax		; eax = address of .get_eip0
.get_eip0:
	add	edx, eax		; edx = .jumper_0 - 9*order
	inc	edx			; compensate for the shorter opcode on the last iteration
	mov	eax, [esp + 28]		; eax = qlp_coeff[]
	xor	ebp, ebp		; ebp = sum = 0
	jmp	edx

	mov	ecx, [eax + 124]	; ecx = qlp_coeff[31]
	imul	ecx, [edi - 128]	; ecx = qlp_coeff[31] * data[i-32]
	add	ebp, ecx		; sum += qlp_coeff[31] * data[i-32]
	mov	ecx, [eax + 120]	; ecx = qlp_coeff[30]
	imul	ecx, [edi - 124]	; ecx = qlp_coeff[30] * data[i-31]
	add	ebp, ecx		; sum += qlp_coeff[30] * data[i-31]
	mov	ecx, [eax + 116]	; ecx = qlp_coeff[29]
	imul	ecx, [edi - 120]	; ecx = qlp_coeff[29] * data[i-30]
	add	ebp, ecx		; sum += qlp_coeff[29] * data[i-30]
	mov	ecx, [eax + 112]	; ecx = qlp_coeff[28]
	imul	ecx, [edi - 116]	; ecx = qlp_coeff[28] * data[i-29]
	add	ebp, ecx		; sum += qlp_coeff[28] * data[i-29]
	mov	ecx, [eax + 108]	; ecx = qlp_coeff[27]
	imul	ecx, [edi - 112]	; ecx = qlp_coeff[27] * data[i-28]
	add	ebp, ecx		; sum += qlp_coeff[27] * data[i-28]
	mov	ecx, [eax + 104]	; ecx = qlp_coeff[26]
	imul	ecx, [edi - 108]	; ecx = qlp_coeff[26] * data[i-27]
	add	ebp, ecx		; sum += qlp_coeff[26] * data[i-27]
	mov	ecx, [eax + 100]	; ecx = qlp_coeff[25]
	imul	ecx, [edi - 104]	; ecx = qlp_coeff[25] * data[i-26]
	add	ebp, ecx		; sum += qlp_coeff[25] * data[i-26]
	mov	ecx, [eax + 96]		; ecx = qlp_coeff[24]
	imul	ecx, [edi - 100]	; ecx = qlp_coeff[24] * data[i-25]
	add	ebp, ecx		; sum += qlp_coeff[24] * data[i-25]
	mov	ecx, [eax + 92]		; ecx = qlp_coeff[23]
	imul	ecx, [edi - 96]		; ecx = qlp_coeff[23] * data[i-24]
	add	ebp, ecx		; sum += qlp_coeff[23] * data[i-24]
	mov	ecx, [eax + 88]		; ecx = qlp_coeff[22]
	imul	ecx, [edi - 92]		; ecx = qlp_coeff[22] * data[i-23]
	add	ebp, ecx		; sum += qlp_coeff[22] * data[i-23]
	mov	ecx, [eax + 84]		; ecx = qlp_coeff[21]
	imul	ecx, [edi - 88]		; ecx = qlp_coeff[21] * data[i-22]
	add	ebp, ecx		; sum += qlp_coeff[21] * data[i-22]
	mov	ecx, [eax + 80]		; ecx = qlp_coeff[20]
	imul	ecx, [edi - 84]		; ecx = qlp_coeff[20] * data[i-21]
	add	ebp, ecx		; sum += qlp_coeff[20] * data[i-21]
	mov	ecx, [eax + 76]		; ecx = qlp_coeff[19]
	imul	ecx, [edi - 80]		; ecx = qlp_coeff[19] * data[i-20]
	add	ebp, ecx		; sum += qlp_coeff[19] * data[i-20]
	mov	ecx, [eax + 72]		; ecx = qlp_coeff[18]
	imul	ecx, [edi - 76]		; ecx = qlp_coeff[18] * data[i-19]
	add	ebp, ecx		; sum += qlp_coeff[18] * data[i-19]
	mov	ecx, [eax + 68]		; ecx = qlp_coeff[17]
	imul	ecx, [edi - 72]		; ecx = qlp_coeff[17] * data[i-18]
	add	ebp, ecx		; sum += qlp_coeff[17] * data[i-18]
	mov	ecx, [eax + 64]		; ecx = qlp_coeff[16]
	imul	ecx, [edi - 68]		; ecx = qlp_coeff[16] * data[i-17]
	add	ebp, ecx		; sum += qlp_coeff[16] * data[i-17]
	mov	ecx, [eax + 60]		; ecx = qlp_coeff[15]
	imul	ecx, [edi - 64]		; ecx = qlp_coeff[15] * data[i-16]
	add	ebp, ecx		; sum += qlp_coeff[15] * data[i-16]
	mov	ecx, [eax + 56]		; ecx = qlp_coeff[14]
	imul	ecx, [edi - 60]		; ecx = qlp_coeff[14] * data[i-15]
	add	ebp, ecx		; sum += qlp_coeff[14] * data[i-15]
	mov	ecx, [eax + 52]		; ecx = qlp_coeff[13]
	imul	ecx, [edi - 56]		; ecx = qlp_coeff[13] * data[i-14]
	add	ebp, ecx		; sum += qlp_coeff[13] * data[i-14]
	mov	ecx, [eax + 48]		; ecx = qlp_coeff[12]
	imul	ecx, [edi - 52]		; ecx = qlp_coeff[12] * data[i-13]
	add	ebp, ecx		; sum += qlp_coeff[12] * data[i-13]
	mov	ecx, [eax + 44]		; ecx = qlp_coeff[11]
	imul	ecx, [edi - 48]		; ecx = qlp_coeff[11] * data[i-12]
	add	ebp, ecx		; sum += qlp_coeff[11] * data[i-12]
	mov	ecx, [eax + 40]		; ecx = qlp_coeff[10]
	imul	ecx, [edi - 44]		; ecx = qlp_coeff[10] * data[i-11]
	add	ebp, ecx		; sum += qlp_coeff[10] * data[i-11]
	mov	ecx, [eax + 36]		; ecx = qlp_coeff[ 9]
	imul	ecx, [edi - 40]		; ecx = qlp_coeff[ 9] * data[i-10]
	add	ebp, ecx		; sum += qlp_coeff[ 9] * data[i-10]
	mov	ecx, [eax + 32]		; ecx = qlp_coeff[ 8]
	imul	ecx, [edi - 36]		; ecx = qlp_coeff[ 8] * data[i- 9]
	add	ebp, ecx		; sum += qlp_coeff[ 8] * data[i- 9]
	mov	ecx, [eax + 28]		; ecx = qlp_coeff[ 7]
	imul	ecx, [edi - 32]		; ecx = qlp_coeff[ 7] * data[i- 8]
	add	ebp, ecx		; sum += qlp_coeff[ 7] * data[i- 8]
	mov	ecx, [eax + 24]		; ecx = qlp_coeff[ 6]
	imul	ecx, [edi - 28]		; ecx = qlp_coeff[ 6] * data[i- 7]
	add	ebp, ecx		; sum += qlp_coeff[ 6] * data[i- 7]
	mov	ecx, [eax + 20]		; ecx = qlp_coeff[ 5]
	imul	ecx, [edi - 24]		; ecx = qlp_coeff[ 5] * data[i- 6]
	add	ebp, ecx		; sum += qlp_coeff[ 5] * data[i- 6]
	mov	ecx, [eax + 16]		; ecx = qlp_coeff[ 4]
	imul	ecx, [edi - 20]		; ecx = qlp_coeff[ 4] * data[i- 5]
	add	ebp, ecx		; sum += qlp_coeff[ 4] * data[i- 5]
	mov	ecx, [eax + 12]		; ecx = qlp_coeff[ 3]
	imul	ecx, [edi - 16]		; ecx = qlp_coeff[ 3] * data[i- 4]
	add	ebp, ecx		; sum += qlp_coeff[ 3] * data[i- 4]
	mov	ecx, [eax + 8]		; ecx = qlp_coeff[ 2]
	imul	ecx, [edi - 12]		; ecx = qlp_coeff[ 2] * data[i- 3]
	add	ebp, ecx		; sum += qlp_coeff[ 2] * data[i- 3]
	mov	ecx, [eax + 4]		; ecx = qlp_coeff[ 1]
	imul	ecx, [edi - 8]		; ecx = qlp_coeff[ 1] * data[i- 2]
	add	ebp, ecx		; sum += qlp_coeff[ 1] * data[i- 2]
	mov	ecx, [eax]		; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul	ecx, [edi - 4]		; ecx = qlp_coeff[ 0] * data[i- 1]
	add	ebp, ecx		; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:

	mov	ecx, [esp + 36]		; cl = lp_quantization
	sar	ebp, cl			; ebp = (sum >> lp_quantization)
	add	ebp, [esi + edi]	; ebp = residual[i] + (sum >> lp_quantization)  (esi holds residual - data)
	mov	[edi], ebp		; data[i] = residual[i] + (sum >> lp_quantization)
	add	edi, byte 4

	dec	ebx
	jz	short .end
	xor	ebp, ebp		; sum = 0, then re-enter the unrolled chain
	jmp	edx

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1342 | |
1343 | ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for |
1344 | ; the channel and qlp_coeffs must be <= 16. Especially note that this routine |
1345 | ; cannot be used for side-channel coded 16bps channels since the effective bps |
1346 | ; is 17. |
1347 | ; WATCHOUT: this routine requires that each data array have a buffer of up to |
1348 | ; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each |
1349 | ; channel n, data[n][-1] through data[n][-3] should be accessible and zero. |
;-----------------------------------------------------------------------
; MMX variant of FLAC__lpc_restore_signal.  cdecl; same signature as
; the plain ia32 version.  Requires order >= 4 (smaller orders are
; delegated to the scalar routine) and, per the WATCHOUT above, <= 16
; bits per sample/coefficient and readable data[-1..-3] (zero-padded
; coefficients can make the j loop read before data[0]).
; Unlike the residual MMX routine this is inherently serial: each
; output sample feeds the next prediction, so it does 1 sample/iter.
;-----------------------------------------------------------------------
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
	; argument offsets as seen AFTER the four pushes below:
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]		; esi = residual[]
	mov	edi, [esp + 40]		; edi = data[]
	mov	eax, [esp + 32]		; eax = order
	mov	ebx, [esp + 24]		; ebx = data_len

	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0
	cmp	eax, byte 4
	jb	near FLAC__lpc_restore_signal_asm_ia32.begin	; order < 4: scalar routine

	mov	edx, [esp + 28]		; edx = qlp_coeff[]
	movd	mm6, [esp + 36]		; mm6 = 0:lp_quantization (psrad shift count)
	mov	ebp, esp		; save esp; restored at .mmx_end

	and	esp, 0xfffffff8		; 8-byte-align the stack for movq

	; copy qlp_coeff[] onto the stack as 16-bit words, qlp_coeff[0]
	; pushed first (= highest address)
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]	; truncate coefficient to 16 bits
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	; zero-pad so the effective order (eax) is a multiple of 4
	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = words qlp_coeff[3],[2],[1],[0] (low word = [3])
	; mm4 = sliding history window: words data[-4],data[-3],data[-2],data[-1]
	movd	mm4, [edi - 16]
	punpckldq	mm4, [edi - 12]
	movd	mm0, [edi - 8]
	punpckldq	mm0, [edi - 4]
	packssdw	mm4, mm0

	cmp	eax, byte 4
	jnbe	short .mmx_4more

	; ---- (padded) order == 4: whole sum comes from mm4 x mm5
	ALIGN	16
.mmx_4_loop_i:
	movq	mm7, mm4
	pmaddwd	mm7, mm5		; two partial dword sums
	movq	mm0, mm7
	punpckhdq	mm7, mm7
	paddd	mm7, mm0		; horizontal add -> sum
	psrad	mm7, mm6		; sum >> lp_quantization
	movd	mm1, [esi]
	paddd	mm7, mm1		; + residual[i]
	movd	[edi], mm7		; data[i] = result
	; slide the window: drop oldest word, append low 16 bits of data[i]
	psllq	mm7, 48
	psrlq	mm4, 16
	por	mm4, mm7

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.mmx_4_loop_i
	jmp	.mmx_end
.mmx_4more:
	; ---- padded order > 4: newest 4 taps from the mm4 window,
	; older taps read from memory in groups of 4 in the j loop
	shl	eax, 2
	neg	eax
	add	eax, byte 16		; eax = 16 - 4*order (constant, negative)
	ALIGN 16
.mmx_4more_loop_i:
	mov	ecx, edi
	add	ecx, eax		; ecx = &data[i] + 16 - 4*order, i.e. the oldest taps
	mov	edx, esp		; edx = packed coeffs, starting at qlp_coeff[order-1]

	movq	mm7, mm4
	pmaddwd	mm7, mm5		; accumulator seeded with the newest 4 taps

	ALIGN	16
.mmx_4more_loop_j:
	; accumulate 4 older taps per iteration
	movd	mm0, [ecx - 16]
	punpckldq	mm0, [ecx - 12]
	movd	mm1, [ecx - 8]
	punpckldq	mm1, [ecx - 4]
	packssdw	mm0, mm1
	pmaddwd	mm0, [edx]
	paddd	mm7, mm0

	add	edx, byte 8		; next 4 packed coefficients
	add	ecx, byte 16		; next 4 samples
	cmp	ecx, edi		; stop when we reach the newest-4 region
	jnz	.mmx_4more_loop_j

	movq	mm0, mm7
	punpckhdq	mm7, mm7
	paddd	mm7, mm0		; horizontal add -> sum
	psrad	mm7, mm6		; sum >> lp_quantization
	movd	mm1, [esi]
	paddd	mm7, mm1		; + residual[i]
	movd	[edi], mm7		; data[i] = result
	; slide the window: drop oldest word, append low 16 bits of data[i]
	psllq	mm7, 48
	psrlq	mm4, 16
	por	mm4, mm7

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	short .mmx_4more_loop_i
.mmx_end:
	emms
	mov	esp, ebp		; undo the alignment/coefficient frame

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1485 | |
1486 | |
1487 | ; ********************************************************************** |
1488 | ; |
1489 | ;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) |
1490 | ; { |
1491 | ; unsigned i, j; |
1492 | ; FLAC__int64 sum; |
1493 | ; |
1494 | ; FLAC__ASSERT(order > 0); |
1495 | ; |
1496 | ; for(i = 0; i < data_len; i++) { |
1497 | ; sum = 0; |
1498 | ; for(j = 0; j < order; j++) |
1499 | ; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; |
1500 | ; residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization); |
1501 | ; } |
1502 | ; } |
;-----------------------------------------------------------------------
; Wide (64-bit accumulator) variant; see the C reference above.
; cdecl.  sum is accumulated in esi:ecx (high:low) via imul's edx:eax
; result plus add/adc; the >> lp_quantization uses shrd, whose count
; (cl) is masked to 0..31, matching ASSERT(lp_quantization <= 31) -
; only the low 32 bits of the shifted sum are kept, as in the C code.
; Two paths: order == 1, and 1 < order <= 32 via computed jump into an
; unrolled chain of 10-byte mov/imul/add/adc groups (last group is 9
; bytes - `mov eax, [ebx]` has no displacement byte - hence `inc ebp`).
;-----------------------------------------------------------------------
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
	; argument offsets as seen AFTER the four pushes below:
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)
	;ASSERT(order <= 32)
	;ASSERT(lp_quantization <= 31)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	ebx, [esp + 24]		; ebx = data_len
	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0

.begin:
	mov	eax, [esp + 32]		; eax = order
	cmp	eax, 1
	jg	short .i_32

	; ---- order == 1: single 64-bit product per sample
	mov	esi, [esp + 40]		; esi = residual[]
	mov	edi, [esp + 20]		; edi = data[]
	mov	ecx, [esp + 28]		; ecx = qlp_coeff[]
	mov	ebp, [ecx]		; ebp = qlp_coeff[0]
	mov	eax, [edi - 4]		; eax = data[-1]
	mov	ecx, [esp + 36]		; cl = lp_quantization
	ALIGN	16
.i_1_loop_i:
	imul	ebp			; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
	shrd	eax, edx, cl		; eax = low 32 bits of (sum >> lp_quantization); cl masked to 0..31
	neg	eax
	add	eax, [edi]		; eax = data[i] - (sum >> lp_quantization)
	mov	[esi], eax		; residual[i] = eax
	mov	eax, [edi]		; eax = data[i], next iteration's data[i-1]
	add	esi, 4
	add	edi, 4
	dec	ebx
	jnz	.i_1_loop_i
	jmp	.end

.mov_eip_to_eax:
	; helper: loads the caller's return address (EIP following the
	; call) into eax, making the computed jump below position-independent
	mov	eax, [esp]
	ret

.i_32:	; eax = order
	; entry point into the unrolled chain = .jumper_0 - 10*order + 1
	neg	eax
	add	eax, eax		; eax = -2*order
	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]	; ebp = -10*order + (.jumper_0 - .get_eip0)
	call	.mov_eip_to_eax		; eax = address of .get_eip0
.get_eip0:
	add	ebp, eax		; ebp = .jumper_0 - 10*order
	inc	ebp			; compensate for the shorter opcode on the last iteration

	mov	ebx, [esp + 28]		; ebx = qlp_coeff[]
	mov	edi, [esp + 20]		; edi = data[]
	sub	[esp + 40], edi		; residual[] -= data[]  (residual[i] addressed as [edi + offset])

	xor	ecx, ecx
	xor	esi, esi
	jmp	ebp

	;eax = --
	;edx = --
	;ecx = 0   (sum low dword)
	;esi = 0   (sum high dword)
	;
	;ebx = qlp_coeff[]
	;edi = data[]
	;ebp = @address

	mov	eax, [ebx + 124]	; eax = qlp_coeff[31]
	imul	dword [edi - 128]	; edx:eax = qlp_coeff[31] * data[i-32]
	add	ecx, eax
	adc	esi, edx		; sum += qlp_coeff[31] * data[i-32]

	mov	eax, [ebx + 120]	; eax = qlp_coeff[30]
	imul	dword [edi - 124]	; edx:eax = qlp_coeff[30] * data[i-31]
	add	ecx, eax
	adc	esi, edx		; sum += qlp_coeff[30] * data[i-31]

	mov	eax, [ebx + 116]
	imul	dword [edi - 120]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 112]
	imul	dword [edi - 116]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 108]
	imul	dword [edi - 112]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 104]
	imul	dword [edi - 108]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 100]
	imul	dword [edi - 104]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 96]
	imul	dword [edi - 100]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 92]
	imul	dword [edi - 96]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 88]
	imul	dword [edi - 92]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 84]
	imul	dword [edi - 88]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 80]
	imul	dword [edi - 84]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 76]
	imul	dword [edi - 80]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 72]
	imul	dword [edi - 76]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 68]
	imul	dword [edi - 72]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 64]
	imul	dword [edi - 68]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 60]
	imul	dword [edi - 64]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 56]
	imul	dword [edi - 60]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 52]
	imul	dword [edi - 56]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 48]
	imul	dword [edi - 52]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 44]
	imul	dword [edi - 48]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 40]
	imul	dword [edi - 44]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 36]
	imul	dword [edi - 40]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 32]
	imul	dword [edi - 36]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 28]
	imul	dword [edi - 32]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 24]
	imul	dword [edi - 28]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 20]
	imul	dword [edi - 24]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 16]
	imul	dword [edi - 20]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 12]
	imul	dword [edi - 16]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 8]
	imul	dword [edi - 12]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 4]
	imul	dword [edi - 8]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx]		; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul	dword [edi - 4]		; edx:eax = qlp_coeff[ 0] * data[i- 1]
	add	ecx, eax
	adc	esi, edx		; sum += qlp_coeff[ 0] * data[i- 1]

.jumper_0:
	mov	edx, ecx
	;esi:edx = sum
	mov	ecx, [esp + 36]		; cl = lp_quantization
	shrd	edx, esi, cl		; edx = low 32 bits of (sum >> lp_quantization); cl masked to 0..31
	;eax = --
	;ecx = --
	;edx = sum >> lp_q
	;esi = --
	neg	edx			; edx = -(sum >> lp_quantization)
	mov	eax, [esp + 40]		; eax = residual[] - data[]
	add	edx, [edi]		; edx = data[i] - (sum >> lp_quantization)
	mov	[edi + eax], edx	; residual[i] = edx
	add	edi, 4

	dec	dword [esp + 24]	; data_len is decremented in place (ebx not used here)
	jz	short .end
	xor	ecx, ecx		; sum = 0, then re-enter the unrolled chain
	xor	esi, esi
	jmp	ebp

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
1767 | |
1768 | ; ********************************************************************** |
1769 | ; |
1770 | ; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) |
1771 | ; { |
1772 | ; unsigned i, j; |
1773 | ; FLAC__int64 sum; |
1774 | ; |
1775 | ; FLAC__ASSERT(order > 0); |
1776 | ; |
1777 | ; for(i = 0; i < data_len; i++) { |
1778 | ; sum = 0; |
1779 | ; for(j = 0; j < order; j++) |
1780 | ; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; |
1781 | ; data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization); |
1782 | ; } |
1783 | ; } |
;-----------------------------------------------------------------------
; void FLAC__lpc_restore_signal_wide_asm_ia32(
;         const FLAC__int32 residual[], unsigned data_len,
;         const FLAC__int32 qlp_coeff[], unsigned order,
;         int lp_quantization, FLAC__int32 data[])
;
; "Wide" LPC restore: the per-sample predictor sum is accumulated in
; 64 bits (esi:ecx high:low) before the shift by lp_quantization,
; matching the C reference above which accumulates into FLAC__int64.
; ABI:   ia32 cdecl, all arguments on the stack; ebx/esi/edi/ebp are
;        saved and restored, eax/ecx/edx and flags are clobbered.
; The [esp + N] offsets below are valid after the four pushes in the
; prologue (16 bytes) plus the 4-byte return address.
;-----------------------------------------------------------------------
	ALIGN 16
cident FLAC__lpc_restore_signal_wide_asm_ia32
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)
	;ASSERT(order <= 32)
	;ASSERT(lp_quantization <= 31)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	ebx, [esp + 24]		; ebx = data_len
	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0

.begin:
	mov	eax, [esp + 32]		; eax = order
	cmp	eax, 1
	jg	short .x87_32		; general path handles 2 <= order <= 32

	; ---- order == 1 fast path:
	;      data[i] = residual[i] + ((qlp_coeff[0] * (FLAC__int64)data[i-1]) >> lp_q)
	mov	esi, [esp + 20]		; esi = residual[]
	mov	edi, [esp + 40]		; edi = data[]
	mov	ecx, [esp + 28]		; ecx = qlp_coeff[]
	mov	ebp, [ecx]		; ebp = qlp_coeff[0]
	mov	eax, [edi - 4]		; eax = data[-1] (warm-up sample before the first output)
	mov	ecx, [esp + 36]		; cl = lp_quantization
	ALIGN	16
.x87_1_loop_i:
	; loop invariant: eax = data[i-1] (the value stored on the previous iteration)
	imul	ebp			; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
	shrd	eax, edx, cl		; eax = low 32 bits of (sum >> lp_quantization); shrd is valid for 0 <= cl <= 31 (see ASSERT above)
	;
	add	eax, [esi]		; eax += residual[i]
	mov	[edi], eax		; data[i] = eax; eax now feeds the next iteration as data[i-1]
	;
	add	esi, 4
	add	edi, 4
	dec	ebx
	jnz	.x87_1_loop_i
	jmp	.end

; Helper thunk to read EIP (ia32 has no "mov eax, eip"): the call below
; pushes its return address, which we copy out of [esp] into eax.
.mov_eip_to_eax:
	mov	eax, [esp]
	ret

.x87_32:				; eax = order
	; Compute the entry point into the unrolled multiply-accumulate
	; chain below.  Each of the 32 unrolled taps assembles to 10 bytes
	; (mov 3 + imul 3 + add 2 + adc 2), so entering at
	; .jumper_0 - 10*order executes exactly "order" taps per sample.
	neg	eax			; eax = -order
	add	eax, eax		; eax = -2*order
	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]	; ebp = -10*order + (.jumper_0 - .get_eip0)
	call	.mov_eip_to_eax
.get_eip0:
	add	ebp, eax		; ebp is now the absolute entry address
	inc	ebp			; compensate for the shorter opcode on the last iteration

	mov	ebx, [esp + 28]		; ebx = qlp_coeff[]
	mov	edi, [esp + 40]		; edi = data[]
	sub	[esp + 20], edi		; residual[] -= data[]; later residual[i] is read as [edi + (residual[]-data[])]

	xor	ecx, ecx		; ecx = low 32 bits of sum = 0
	xor	esi, esi		; esi = high 32 bits of sum = 0
	jmp	ebp			; enter the chain at tap "order"

	;eax = --
	;edx = --
	;ecx = 0
	;esi = 0
	;
	;ebx = qlp_coeff[]
	;edi = data[]
	;ebp = @address

	; Unrolled 64-bit multiply-accumulate chain, one 10-byte group per
	; coefficient, highest tap first.  Per-sample entry via "jmp ebp"
	; lands "order" taps before .jumper_0 and falls through to it.
	; Invariant at every tap: esi:ecx = partial 64-bit sum.
	mov	eax, [ebx + 124]	; eax = qlp_coeff[31]
	imul	dword [edi - 128]	; edx:eax = qlp_coeff[31] * data[i-32]
	add	ecx, eax
	adc	esi, edx		; sum += qlp_coeff[31] * data[i-32]

	mov	eax, [ebx + 120]	; eax = qlp_coeff[30]
	imul	dword [edi - 124]	; edx:eax = qlp_coeff[30] * data[i-31]
	add	ecx, eax
	adc	esi, edx		; sum += qlp_coeff[30] * data[i-31]

	mov	eax, [ebx + 116]
	imul	dword [edi - 120]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 112]
	imul	dword [edi - 116]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 108]
	imul	dword [edi - 112]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 104]
	imul	dword [edi - 108]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 100]
	imul	dword [edi - 104]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 96]
	imul	dword [edi - 100]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 92]
	imul	dword [edi - 96]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 88]
	imul	dword [edi - 92]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 84]
	imul	dword [edi - 88]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 80]
	imul	dword [edi - 84]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 76]
	imul	dword [edi - 80]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 72]
	imul	dword [edi - 76]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 68]
	imul	dword [edi - 72]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 64]
	imul	dword [edi - 68]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 60]
	imul	dword [edi - 64]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 56]
	imul	dword [edi - 60]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 52]
	imul	dword [edi - 56]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 48]
	imul	dword [edi - 52]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 44]
	imul	dword [edi - 48]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 40]
	imul	dword [edi - 44]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 36]
	imul	dword [edi - 40]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 32]
	imul	dword [edi - 36]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 28]
	imul	dword [edi - 32]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 24]
	imul	dword [edi - 28]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 20]
	imul	dword [edi - 24]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 16]
	imul	dword [edi - 20]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 12]
	imul	dword [edi - 16]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 8]
	imul	dword [edi - 12]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 4]
	imul	dword [edi - 8]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx]		; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction — no displacement byte, hence the "inc ebp" above)
	imul	dword [edi - 4]		; edx:eax = qlp_coeff[ 0] * data[i- 1]
	add	ecx, eax
	adc	esi, edx		; sum += qlp_coeff[ 0] * data[i- 1]

.jumper_0:
	mov	edx, ecx		; move low 32 bits of sum out of ecx so cl can hold the shift count
	;esi:edx = sum
	mov	ecx, [esp + 36]		; cl = lp_quantization
	shrd	edx, esi, cl		; edx = low 32 bits of (sum >> lp_quantization); shrd is valid for 0 <= cl <= 31 (see ASSERT above)
	;eax = --
	;ecx = --
	;edx = sum >> lp_q
	;esi = --
	;
	mov	eax, [esp + 20]		; eax = residual[] - data[] (offset precomputed above)
	add	edx, [edi + eax]	; edx = residual[i] + (sum >> lp_quantization)
	mov	[edi], edx		; data[i] = residual[i] + (sum >> lp_quantization)
	add	edi, 4			; edi = &data[i+1]

	dec	dword [esp + 24]	; --data_len (the counter lives in the argument slot)
	jz	short .end
	xor	ecx, ecx		; reset 64-bit sum to 0 for the next sample
	xor	esi, esi
	jmp	ebp			; next sample: re-enter the unrolled chain

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
2048 | |
2049 | ; end |