deps/flac-1.3.2/src/libFLAC/ia32/lpc_asm.nasm

   1 ;  vim:filetype=nasm ts=8
   2
   3 ;  libFLAC - Free Lossless Audio Codec library
   4 ;  Copyright (C) 2001-2009  Josh Coalson
   5 ;  Copyright (C) 2011-2016  Xiph.Org Foundation
   6 ;
   7 ;  Redistribution and use in source and binary forms, with or without
   8 ;  modification, are permitted provided that the following conditions
   9 ;  are met:
  10 ;
  11 ;  - Redistributions of source code must retain the above copyright
  12 ;  notice, this list of conditions and the following disclaimer.
  13 ;
  14 ;  - Redistributions in binary form must reproduce the above copyright
  15 ;  notice, this list of conditions and the following disclaimer in the
  16 ;  documentation and/or other materials provided with the distribution.
  17 ;
  18 ;  - Neither the name of the Xiph.org Foundation nor the names of its
  19 ;  contributors may be used to endorse or promote products derived from
  20 ;  this software without specific prior written permission.
  21 ;
  22 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  23 ;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  24 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  25 ;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
  26 ;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  27 ;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  28 ;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  29 ;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  30 ;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  31 ;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  32 ;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33
  34 %include "nasm.h"
  35
  36         data_section
  37 ; Public entry points of this file (cglobal/cident/data_section/code_section are presumably macros from nasm.h -- confirm there).
  38 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
  39 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
  40 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
  41 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
  42 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
  43 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
  44 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
  45 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
  46 cglobal FLAC__lpc_restore_signal_asm_ia32
  47 cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
  48 cglobal FLAC__lpc_restore_signal_wide_asm_ia32
  49 ; Everything from here on is emitted into the code section.
  50         code_section
  51
  52 ; **********************************************************************
  53 ;
  54 ; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
  55 ; {
  56 ;       FLAC__real d;
  57 ;       unsigned sample, coeff;
  58 ;       const unsigned limit = data_len - lag;
  59 ;
  60 ;       FLAC__ASSERT(lag > 0);
  61 ;       FLAC__ASSERT(lag <= data_len);
  62 ;
  63 ;       for(coeff = 0; coeff < lag; coeff++)
  64 ;               autoc[coeff] = 0.0;
  65 ;       for(sample = 0; sample <= limit; sample++) {
  66 ;               d = data[sample];
  67 ;               for(coeff = 0; coeff < lag; coeff++)
  68 ;                       autoc[coeff] += d * data[sample+coeff];
  69 ;       }
  70 ;       for(; sample < data_len; sample++) {
  71 ;               d = data[sample];
  72 ;               for(coeff = 0; coeff < data_len - sample; coeff++)
  73 ;                       autoc[coeff] += d * data[sample+coeff];
  74 ;       }
  75 ; }
  76 ;
  77         ALIGN 16
  78 cident FLAC__lpc_compute_autocorrelation_asm_ia32
  79         ;[esp + 28] == autoc[]                  (all offsets are as seen after the three pushes at .begin)
  80         ;[esp + 24] == lag
  81         ;[esp + 20] == data_len
  82         ;[esp + 16] == data[]
  83         ; x87-only implementation; esi, edi and ebx are saved/restored, eax, ecx and edx are overwritten.
  84         ;ASSERT(lag > 0)
  85         ;ASSERT(lag <= 33)                      (the first ladder below has 33 groups: coeff 32 down to 0)
  86         ;ASSERT(lag <= data_len)
  87         ; Both loops enter an unrolled multiply-accumulate ladder (one 11-byte group per coeff) via a computed "jmp edx".
  88 .begin:
  89         push    esi
  90         push    edi
  91         push    ebx
  92
  93         ;       for(coeff = 0; coeff < lag; coeff++)
  94         ;               autoc[coeff] = 0.0;
  95         mov     edi, [esp + 28]                 ; edi == autoc
  96         mov     ecx, [esp + 24]                 ; ecx = # of dwords (=lag) of 0 to write
  97         xor     eax, eax                        ; all-zero bits == 0.0f
  98         rep     stosd                           ; advances edi, so autoc is reloaded below
  99
 100         ;       const unsigned limit = data_len - lag;
 101         mov     eax, [esp + 24]                 ; eax == lag
 102         mov     ecx, [esp + 20]
 103         sub     ecx, eax                        ; ecx == limit
 104
 105         mov     edi, [esp + 28]                 ; edi == autoc
 106         mov     esi, [esp + 16]                 ; esi == data
 107         inc     ecx                             ; we are looping <= limit so we add one to the counter
 108
 109         ;       for(sample = 0; sample <= limit; sample++) {
 110         ;               d = data[sample];
 111         ;               for(coeff = 0; coeff < lag; coeff++)
 112         ;                       autoc[coeff] += d * data[sample+coeff];
 113         ;       }
 114         fld     dword [esi]                     ; ST = d <- data[sample]
 115         ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax); entry point = .jumper1_0 - 11*lag (+ corrections below)
 116         lea     edx, [eax + eax*2]              ; edx = 3*lag
 117         neg     edx                             ; edx = -3*lag
 118         lea     edx, [eax + edx*4 + .jumper1_0 - .get_eip1]     ; edx = -11*lag + (.jumper1_0 - .get_eip1)
 119         call    .mov_eip_to_ebx                 ; ebx <- run-time address of .get_eip1 (no absolute address needed)
 120 .get_eip1:
 121         add     edx, ebx                        ; edx = absolute address inside the ladder
 122         inc     edx                             ; compensate for the shorter opcode on the last iteration
 123         inc     edx                             ; compensate for the shorter opcode on the last iteration
 124         inc     edx                             ; compensate for the shorter opcode on the last iteration
 125         cmp     eax, 33                         ; only lag == 33 enters at the coeff-32 group
 126         jne     .loop1_start
 127         sub     edx, byte 9                     ; compensate for the longer opcodes on the first iteration
 128 .loop1_start:
 129         jmp     edx                             ; enter the ladder at the group for coeff == lag-1
 130
 131 .mov_eip_to_ebx:
 132         mov     ebx, [esp]                      ; ebx <- return address, i.e. the label following the call
 133         ret
 134
 135         fld     st0                             ; ST = d d
 136         fmul    dword [esi + (32*4)]            ; ST = d*data[sample+32] d              WATCHOUT: not a byte displacement here!
 137         fadd    dword [edi + (32*4)]            ; ST = autoc[32]+d*data[sample+32] d    WATCHOUT: not a byte displacement here!
 138         fstp    dword [edi + (32*4)]            ; autoc[32]+=d*data[sample+32]  ST = d  WATCHOUT: not a byte displacement here!
 139         fld     st0                             ; ST = d d
 140         fmul    dword [esi + (31*4)]            ; ST = d*data[sample+31] d
 141         fadd    dword [edi + (31*4)]            ; ST = autoc[31]+d*data[sample+31] d
 142         fstp    dword [edi + (31*4)]            ; autoc[31]+=d*data[sample+31]  ST = d
 143         fld     st0                             ; ST = d d
 144         fmul    dword [esi + (30*4)]            ; ST = d*data[sample+30] d
 145         fadd    dword [edi + (30*4)]            ; ST = autoc[30]+d*data[sample+30] d
 146         fstp    dword [edi + (30*4)]            ; autoc[30]+=d*data[sample+30]  ST = d
 147         fld     st0                             ; ST = d d
 148         fmul    dword [esi + (29*4)]            ; ST = d*data[sample+29] d
 149         fadd    dword [edi + (29*4)]            ; ST = autoc[29]+d*data[sample+29] d
 150         fstp    dword [edi + (29*4)]            ; autoc[29]+=d*data[sample+29]  ST = d
 151         fld     st0                             ; ST = d d
 152         fmul    dword [esi + (28*4)]            ; ST = d*data[sample+28] d
 153         fadd    dword [edi + (28*4)]            ; ST = autoc[28]+d*data[sample+28] d
 154         fstp    dword [edi + (28*4)]            ; autoc[28]+=d*data[sample+28]  ST = d
 155         fld     st0                             ; ST = d d
 156         fmul    dword [esi + (27*4)]            ; ST = d*data[sample+27] d
 157         fadd    dword [edi + (27*4)]            ; ST = autoc[27]+d*data[sample+27] d
 158         fstp    dword [edi + (27*4)]            ; autoc[27]+=d*data[sample+27]  ST = d
 159         fld     st0                             ; ST = d d
 160         fmul    dword [esi + (26*4)]            ; ST = d*data[sample+26] d
 161         fadd    dword [edi + (26*4)]            ; ST = autoc[26]+d*data[sample+26] d
 162         fstp    dword [edi + (26*4)]            ; autoc[26]+=d*data[sample+26]  ST = d
 163         fld     st0                             ; ST = d d
 164         fmul    dword [esi + (25*4)]            ; ST = d*data[sample+25] d
 165         fadd    dword [edi + (25*4)]            ; ST = autoc[25]+d*data[sample+25] d
 166         fstp    dword [edi + (25*4)]            ; autoc[25]+=d*data[sample+25]  ST = d
 167         fld     st0                             ; ST = d d
 168         fmul    dword [esi + (24*4)]            ; ST = d*data[sample+24] d
 169         fadd    dword [edi + (24*4)]            ; ST = autoc[24]+d*data[sample+24] d
 170         fstp    dword [edi + (24*4)]            ; autoc[24]+=d*data[sample+24]  ST = d
 171         fld     st0                             ; ST = d d
 172         fmul    dword [esi + (23*4)]            ; ST = d*data[sample+23] d
 173         fadd    dword [edi + (23*4)]            ; ST = autoc[23]+d*data[sample+23] d
 174         fstp    dword [edi + (23*4)]            ; autoc[23]+=d*data[sample+23]  ST = d
 175         fld     st0                             ; ST = d d
 176         fmul    dword [esi + (22*4)]            ; ST = d*data[sample+22] d
 177         fadd    dword [edi + (22*4)]            ; ST = autoc[22]+d*data[sample+22] d
 178         fstp    dword [edi + (22*4)]            ; autoc[22]+=d*data[sample+22]  ST = d
 179         fld     st0                             ; ST = d d
 180         fmul    dword [esi + (21*4)]            ; ST = d*data[sample+21] d
 181         fadd    dword [edi + (21*4)]            ; ST = autoc[21]+d*data[sample+21] d
 182         fstp    dword [edi + (21*4)]            ; autoc[21]+=d*data[sample+21]  ST = d
 183         fld     st0                             ; ST = d d
 184         fmul    dword [esi + (20*4)]            ; ST = d*data[sample+20] d
 185         fadd    dword [edi + (20*4)]            ; ST = autoc[20]+d*data[sample+20] d
 186         fstp    dword [edi + (20*4)]            ; autoc[20]+=d*data[sample+20]  ST = d
 187         fld     st0                             ; ST = d d
 188         fmul    dword [esi + (19*4)]            ; ST = d*data[sample+19] d
 189         fadd    dword [edi + (19*4)]            ; ST = autoc[19]+d*data[sample+19] d
 190         fstp    dword [edi + (19*4)]            ; autoc[19]+=d*data[sample+19]  ST = d
 191         fld     st0                             ; ST = d d
 192         fmul    dword [esi + (18*4)]            ; ST = d*data[sample+18] d
 193         fadd    dword [edi + (18*4)]            ; ST = autoc[18]+d*data[sample+18] d
 194         fstp    dword [edi + (18*4)]            ; autoc[18]+=d*data[sample+18]  ST = d
 195         fld     st0                             ; ST = d d
 196         fmul    dword [esi + (17*4)]            ; ST = d*data[sample+17] d
 197         fadd    dword [edi + (17*4)]            ; ST = autoc[17]+d*data[sample+17] d
 198         fstp    dword [edi + (17*4)]            ; autoc[17]+=d*data[sample+17]  ST = d
 199         fld     st0                             ; ST = d d
 200         fmul    dword [esi + (16*4)]            ; ST = d*data[sample+16] d
 201         fadd    dword [edi + (16*4)]            ; ST = autoc[16]+d*data[sample+16] d
 202         fstp    dword [edi + (16*4)]            ; autoc[16]+=d*data[sample+16]  ST = d
 203         fld     st0                             ; ST = d d
 204         fmul    dword [esi + (15*4)]            ; ST = d*data[sample+15] d
 205         fadd    dword [edi + (15*4)]            ; ST = autoc[15]+d*data[sample+15] d
 206         fstp    dword [edi + (15*4)]            ; autoc[15]+=d*data[sample+15]  ST = d
 207         fld     st0                             ; ST = d d
 208         fmul    dword [esi + (14*4)]            ; ST = d*data[sample+14] d
 209         fadd    dword [edi + (14*4)]            ; ST = autoc[14]+d*data[sample+14] d
 210         fstp    dword [edi + (14*4)]            ; autoc[14]+=d*data[sample+14]  ST = d
 211         fld     st0                             ; ST = d d
 212         fmul    dword [esi + (13*4)]            ; ST = d*data[sample+13] d
 213         fadd    dword [edi + (13*4)]            ; ST = autoc[13]+d*data[sample+13] d
 214         fstp    dword [edi + (13*4)]            ; autoc[13]+=d*data[sample+13]  ST = d
 215         fld     st0                             ; ST = d d
 216         fmul    dword [esi + (12*4)]            ; ST = d*data[sample+12] d
 217         fadd    dword [edi + (12*4)]            ; ST = autoc[12]+d*data[sample+12] d
 218         fstp    dword [edi + (12*4)]            ; autoc[12]+=d*data[sample+12]  ST = d
 219         fld     st0                             ; ST = d d
 220         fmul    dword [esi + (11*4)]            ; ST = d*data[sample+11] d
 221         fadd    dword [edi + (11*4)]            ; ST = autoc[11]+d*data[sample+11] d
 222         fstp    dword [edi + (11*4)]            ; autoc[11]+=d*data[sample+11]  ST = d
 223         fld     st0                             ; ST = d d
 224         fmul    dword [esi + (10*4)]            ; ST = d*data[sample+10] d
 225         fadd    dword [edi + (10*4)]            ; ST = autoc[10]+d*data[sample+10] d
 226         fstp    dword [edi + (10*4)]            ; autoc[10]+=d*data[sample+10]  ST = d
 227         fld     st0                             ; ST = d d
 228         fmul    dword [esi + ( 9*4)]            ; ST = d*data[sample+9] d
 229         fadd    dword [edi + ( 9*4)]            ; ST = autoc[9]+d*data[sample+9] d
 230         fstp    dword [edi + ( 9*4)]            ; autoc[9]+=d*data[sample+9]  ST = d
 231         fld     st0                             ; ST = d d
 232         fmul    dword [esi + ( 8*4)]            ; ST = d*data[sample+8] d
 233         fadd    dword [edi + ( 8*4)]            ; ST = autoc[8]+d*data[sample+8] d
 234         fstp    dword [edi + ( 8*4)]            ; autoc[8]+=d*data[sample+8]  ST = d
 235         fld     st0                             ; ST = d d
 236         fmul    dword [esi + ( 7*4)]            ; ST = d*data[sample+7] d
 237         fadd    dword [edi + ( 7*4)]            ; ST = autoc[7]+d*data[sample+7] d
 238         fstp    dword [edi + ( 7*4)]            ; autoc[7]+=d*data[sample+7]  ST = d
 239         fld     st0                             ; ST = d d
 240         fmul    dword [esi + ( 6*4)]            ; ST = d*data[sample+6] d
 241         fadd    dword [edi + ( 6*4)]            ; ST = autoc[6]+d*data[sample+6] d
 242         fstp    dword [edi + ( 6*4)]            ; autoc[6]+=d*data[sample+6]  ST = d
 243         fld     st0                             ; ST = d d
 244         fmul    dword [esi + ( 5*4)]            ; ST = d*data[sample+5] d
 245         fadd    dword [edi + ( 5*4)]            ; ST = autoc[5]+d*data[sample+5] d
 246         fstp    dword [edi + ( 5*4)]            ; autoc[5]+=d*data[sample+5]  ST = d
 247         fld     st0                             ; ST = d d
 248         fmul    dword [esi + ( 4*4)]            ; ST = d*data[sample+4] d
 249         fadd    dword [edi + ( 4*4)]            ; ST = autoc[4]+d*data[sample+4] d
 250         fstp    dword [edi + ( 4*4)]            ; autoc[4]+=d*data[sample+4]  ST = d
 251         fld     st0                             ; ST = d d
 252         fmul    dword [esi + ( 3*4)]            ; ST = d*data[sample+3] d
 253         fadd    dword [edi + ( 3*4)]            ; ST = autoc[3]+d*data[sample+3] d
 254         fstp    dword [edi + ( 3*4)]            ; autoc[3]+=d*data[sample+3]  ST = d
 255         fld     st0                             ; ST = d d
 256         fmul    dword [esi + ( 2*4)]            ; ST = d*data[sample+2] d
 257         fadd    dword [edi + ( 2*4)]            ; ST = autoc[2]+d*data[sample+2] d
 258         fstp    dword [edi + ( 2*4)]            ; autoc[2]+=d*data[sample+2]  ST = d
 259         fld     st0                             ; ST = d d
 260         fmul    dword [esi + ( 1*4)]            ; ST = d*data[sample+1] d
 261         fadd    dword [edi + ( 1*4)]            ; ST = autoc[1]+d*data[sample+1] d
 262         fstp    dword [edi + ( 1*4)]            ; autoc[1]+=d*data[sample+1]  ST = d
 263         fld     st0                             ; ST = d d
 264         fmul    dword [esi]                     ; ST = d*data[sample] d                 WATCHOUT: no displacement byte here!
 265         fadd    dword [edi]                     ; ST = autoc[0]+d*data[sample] d        WATCHOUT: no displacement byte here!
 266         fstp    dword [edi]                     ; autoc[0]+=d*data[sample]  ST = d      WATCHOUT: no displacement byte here!
 267 .jumper1_0:
 268
 269         fstp    st0                             ; pop d, ST = empty
 270         add     esi, byte 4                     ; sample++
 271         dec     ecx
 272         jz      .loop1_end
 273         fld     dword [esi]                     ; ST = d <- data[sample]
 274         jmp     edx                             ; same entry point every time: always lag coefficients per sample
 275 .loop1_end:
 276
 277         ;       for(; sample < data_len; sample++) {
 278         ;               d = data[sample];
 279         ;               for(coeff = 0; coeff < data_len - sample; coeff++)
 280         ;                       autoc[coeff] += d * data[sample+coeff];
 281         ;       }
 282         mov     ecx, [esp + 24]                 ; ecx <- lag
 283         dec     ecx                             ; ecx <- lag - 1
 284         jz      near .end                       ; skip loop if 0 (i.e. lag == 1)
 285
 286         fld     dword [esi]                     ; ST = d <- data[sample]
 287         mov     eax, ecx                        ; eax <- lag - 1 == data_len - sample the first time through
 288         ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
 289         lea     edx, [eax + eax*2]              ; edx = 3*(lag-1)
 290         neg     edx                             ; edx = -3*(lag-1)
 291         lea     edx, [eax + edx*4 + .jumper2_0 - .get_eip2]     ; edx = -11*(lag-1) + (.jumper2_0 - .get_eip2)
 292         call    .mov_eip_to_ebx                 ; ebx <- run-time address of .get_eip2
 293 .get_eip2:
 294         add     edx, ebx                        ; edx = absolute address inside the second ladder
 295         inc     edx                             ; compensate for the shorter opcode on the last iteration
 296         inc     edx                             ; compensate for the shorter opcode on the last iteration
 297         inc     edx                             ; compensate for the shorter opcode on the last iteration
 298         jmp     edx                             ; at most 32 groups are needed here, so no long-displacement fixup
 299
 300         fld     st0                             ; ST = d d
 301         fmul    dword [esi + (31*4)]            ; ST = d*data[sample+31] d
 302         fadd    dword [edi + (31*4)]            ; ST = autoc[31]+d*data[sample+31] d
 303         fstp    dword [edi + (31*4)]            ; autoc[31]+=d*data[sample+31]  ST = d
 304         fld     st0                             ; ST = d d
 305         fmul    dword [esi + (30*4)]            ; ST = d*data[sample+30] d
 306         fadd    dword [edi + (30*4)]            ; ST = autoc[30]+d*data[sample+30] d
 307         fstp    dword [edi + (30*4)]            ; autoc[30]+=d*data[sample+30]  ST = d
 308         fld     st0                             ; ST = d d
 309         fmul    dword [esi + (29*4)]            ; ST = d*data[sample+29] d
 310         fadd    dword [edi + (29*4)]            ; ST = autoc[29]+d*data[sample+29] d
 311         fstp    dword [edi + (29*4)]            ; autoc[29]+=d*data[sample+29]  ST = d
 312         fld     st0                             ; ST = d d
 313         fmul    dword [esi + (28*4)]            ; ST = d*data[sample+28] d
 314         fadd    dword [edi + (28*4)]            ; ST = autoc[28]+d*data[sample+28] d
 315         fstp    dword [edi + (28*4)]            ; autoc[28]+=d*data[sample+28]  ST = d
 316         fld     st0                             ; ST = d d
 317         fmul    dword [esi + (27*4)]            ; ST = d*data[sample+27] d
 318         fadd    dword [edi + (27*4)]            ; ST = autoc[27]+d*data[sample+27] d
 319         fstp    dword [edi + (27*4)]            ; autoc[27]+=d*data[sample+27]  ST = d
 320         fld     st0                             ; ST = d d
 321         fmul    dword [esi + (26*4)]            ; ST = d*data[sample+26] d
 322         fadd    dword [edi + (26*4)]            ; ST = autoc[26]+d*data[sample+26] d
 323         fstp    dword [edi + (26*4)]            ; autoc[26]+=d*data[sample+26]  ST = d
 324         fld     st0                             ; ST = d d
 325         fmul    dword [esi + (25*4)]            ; ST = d*data[sample+25] d
 326         fadd    dword [edi + (25*4)]            ; ST = autoc[25]+d*data[sample+25] d
 327         fstp    dword [edi + (25*4)]            ; autoc[25]+=d*data[sample+25]  ST = d
 328         fld     st0                             ; ST = d d
 329         fmul    dword [esi + (24*4)]            ; ST = d*data[sample+24] d
 330         fadd    dword [edi + (24*4)]            ; ST = autoc[24]+d*data[sample+24] d
 331         fstp    dword [edi + (24*4)]            ; autoc[24]+=d*data[sample+24]  ST = d
 332         fld     st0                             ; ST = d d
 333         fmul    dword [esi + (23*4)]            ; ST = d*data[sample+23] d
 334         fadd    dword [edi + (23*4)]            ; ST = autoc[23]+d*data[sample+23] d
 335         fstp    dword [edi + (23*4)]            ; autoc[23]+=d*data[sample+23]  ST = d
 336         fld     st0                             ; ST = d d
 337         fmul    dword [esi + (22*4)]            ; ST = d*data[sample+22] d
 338         fadd    dword [edi + (22*4)]            ; ST = autoc[22]+d*data[sample+22] d
 339         fstp    dword [edi + (22*4)]            ; autoc[22]+=d*data[sample+22]  ST = d
 340         fld     st0                             ; ST = d d
 341         fmul    dword [esi + (21*4)]            ; ST = d*data[sample+21] d
 342         fadd    dword [edi + (21*4)]            ; ST = autoc[21]+d*data[sample+21] d
 343         fstp    dword [edi + (21*4)]            ; autoc[21]+=d*data[sample+21]  ST = d
 344         fld     st0                             ; ST = d d
 345         fmul    dword [esi + (20*4)]            ; ST = d*data[sample+20] d
 346         fadd    dword [edi + (20*4)]            ; ST = autoc[20]+d*data[sample+20] d
 347         fstp    dword [edi + (20*4)]            ; autoc[20]+=d*data[sample+20]  ST = d
 348         fld     st0                             ; ST = d d
 349         fmul    dword [esi + (19*4)]            ; ST = d*data[sample+19] d
 350         fadd    dword [edi + (19*4)]            ; ST = autoc[19]+d*data[sample+19] d
 351         fstp    dword [edi + (19*4)]            ; autoc[19]+=d*data[sample+19]  ST = d
 352         fld     st0                             ; ST = d d
 353         fmul    dword [esi + (18*4)]            ; ST = d*data[sample+18] d
 354         fadd    dword [edi + (18*4)]            ; ST = autoc[18]+d*data[sample+18] d
 355         fstp    dword [edi + (18*4)]            ; autoc[18]+=d*data[sample+18]  ST = d
 356         fld     st0                             ; ST = d d
 357         fmul    dword [esi + (17*4)]            ; ST = d*data[sample+17] d
 358         fadd    dword [edi + (17*4)]            ; ST = autoc[17]+d*data[sample+17] d
 359         fstp    dword [edi + (17*4)]            ; autoc[17]+=d*data[sample+17]  ST = d
 360         fld     st0                             ; ST = d d
 361         fmul    dword [esi + (16*4)]            ; ST = d*data[sample+16] d
 362         fadd    dword [edi + (16*4)]            ; ST = autoc[16]+d*data[sample+16] d
 363         fstp    dword [edi + (16*4)]            ; autoc[16]+=d*data[sample+16]  ST = d
 364         fld     st0                             ; ST = d d
 365         fmul    dword [esi + (15*4)]            ; ST = d*data[sample+15] d
 366         fadd    dword [edi + (15*4)]            ; ST = autoc[15]+d*data[sample+15] d
 367         fstp    dword [edi + (15*4)]            ; autoc[15]+=d*data[sample+15]  ST = d
 368         fld     st0                             ; ST = d d
 369         fmul    dword [esi + (14*4)]            ; ST = d*data[sample+14] d
 370         fadd    dword [edi + (14*4)]            ; ST = autoc[14]+d*data[sample+14] d
 371         fstp    dword [edi + (14*4)]            ; autoc[14]+=d*data[sample+14]  ST = d
 372         fld     st0                             ; ST = d d
 373         fmul    dword [esi + (13*4)]            ; ST = d*data[sample+13] d
 374         fadd    dword [edi + (13*4)]            ; ST = autoc[13]+d*data[sample+13] d
 375         fstp    dword [edi + (13*4)]            ; autoc[13]+=d*data[sample+13]  ST = d
 376         fld     st0                             ; ST = d d
 377         fmul    dword [esi + (12*4)]            ; ST = d*data[sample+12] d
 378         fadd    dword [edi + (12*4)]            ; ST = autoc[12]+d*data[sample+12] d
 379         fstp    dword [edi + (12*4)]            ; autoc[12]+=d*data[sample+12]  ST = d
 380         fld     st0                             ; ST = d d
 381         fmul    dword [esi + (11*4)]            ; ST = d*data[sample+11] d
 382         fadd    dword [edi + (11*4)]            ; ST = autoc[11]+d*data[sample+11] d
 383         fstp    dword [edi + (11*4)]            ; autoc[11]+=d*data[sample+11]  ST = d
 384         fld     st0                             ; ST = d d
 385         fmul    dword [esi + (10*4)]            ; ST = d*data[sample+10] d
 386         fadd    dword [edi + (10*4)]            ; ST = autoc[10]+d*data[sample+10] d
 387         fstp    dword [edi + (10*4)]            ; autoc[10]+=d*data[sample+10]  ST = d
 388         fld     st0                             ; ST = d d
 389         fmul    dword [esi + ( 9*4)]            ; ST = d*data[sample+9] d
 390         fadd    dword [edi + ( 9*4)]            ; ST = autoc[9]+d*data[sample+9] d
 391         fstp    dword [edi + ( 9*4)]            ; autoc[9]+=d*data[sample+9]  ST = d
 392         fld     st0                             ; ST = d d
 393         fmul    dword [esi + ( 8*4)]            ; ST = d*data[sample+8] d
 394         fadd    dword [edi + ( 8*4)]            ; ST = autoc[8]+d*data[sample+8] d
 395         fstp    dword [edi + ( 8*4)]            ; autoc[8]+=d*data[sample+8]  ST = d
 396         fld     st0                             ; ST = d d
 397         fmul    dword [esi + ( 7*4)]            ; ST = d*data[sample+7] d
 398         fadd    dword [edi + ( 7*4)]            ; ST = autoc[7]+d*data[sample+7] d
 399         fstp    dword [edi + ( 7*4)]            ; autoc[7]+=d*data[sample+7]  ST = d
 400         fld     st0                             ; ST = d d
 401         fmul    dword [esi + ( 6*4)]            ; ST = d*data[sample+6] d
 402         fadd    dword [edi + ( 6*4)]            ; ST = autoc[6]+d*data[sample+6] d
 403         fstp    dword [edi + ( 6*4)]            ; autoc[6]+=d*data[sample+6]  ST = d
 404         fld     st0                             ; ST = d d
 405         fmul    dword [esi + ( 5*4)]            ; ST = d*data[sample+5] d
 406         fadd    dword [edi + ( 5*4)]            ; ST = autoc[5]+d*data[sample+5] d
 407         fstp    dword [edi + ( 5*4)]            ; autoc[5]+=d*data[sample+5]  ST = d
 408         fld     st0                             ; ST = d d
 409         fmul    dword [esi + ( 4*4)]            ; ST = d*data[sample+4] d
 410         fadd    dword [edi + ( 4*4)]            ; ST = autoc[4]+d*data[sample+4] d
 411         fstp    dword [edi + ( 4*4)]            ; autoc[4]+=d*data[sample+4]  ST = d
 412         fld     st0                             ; ST = d d
 413         fmul    dword [esi + ( 3*4)]            ; ST = d*data[sample+3] d
 414         fadd    dword [edi + ( 3*4)]            ; ST = autoc[3]+d*data[sample+3] d
 415         fstp    dword [edi + ( 3*4)]            ; autoc[3]+=d*data[sample+3]  ST = d
 416         fld     st0                             ; ST = d d
 417         fmul    dword [esi + ( 2*4)]            ; ST = d*data[sample+2] d
 418         fadd    dword [edi + ( 2*4)]            ; ST = autoc[2]+d*data[sample+2] d
 419         fstp    dword [edi + ( 2*4)]            ; autoc[2]+=d*data[sample+2]  ST = d
 420         fld     st0                             ; ST = d d
 421         fmul    dword [esi + ( 1*4)]            ; ST = d*data[sample+1] d
 422         fadd    dword [edi + ( 1*4)]            ; ST = autoc[1]+d*data[sample+1] d
 423         fstp    dword [edi + ( 1*4)]            ; autoc[1]+=d*data[sample+1]  ST = d
 424         fld     st0                             ; ST = d d
 425         fmul    dword [esi]                     ; ST = d*data[sample] d                 WATCHOUT: no displacement byte here!
 426         fadd    dword [edi]                     ; ST = autoc[0]+d*data[sample] d        WATCHOUT: no displacement byte here!
 427         fstp    dword [edi]                     ; autoc[0]+=d*data[sample]  ST = d      WATCHOUT: no displacement byte here!
 428 .jumper2_0:
 429
 430         fstp    st0                             ; pop d, ST = empty
 431         add     esi, byte 4                     ; sample++
 432         dec     ecx
 433         jz      .loop2_end
 434         add     edx, byte 11                    ; adjust our inner loop counter by adjusting the jump target (one coeff group fewer per sample)
 435         fld     dword [esi]                     ; ST = d <- data[sample]
 436         jmp     edx
 437 .loop2_end:
 438
 439 .end:
 440         pop     ebx                             ; restore the registers pushed at .begin, in reverse order
 441         pop     edi
 442         pop     esi
 443         ret
 444
 445         ALIGN 16
 446 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
 447         ;[esp + 16] == autoc[]
 448         ;[esp + 12] == lag                      (never read below: four lags are always computed)
 449         ;[esp + 8] == data_len
 450         ;[esp + 4] == data[]
 451         ; SSE version: xmm5 accumulates autoc[0..3]; nothing is pushed, only eax, edx, xmm0, xmm2, xmm5 are written.
 452         ;ASSERT(lag > 0)
 453         ;ASSERT(lag <= 4)
 454         ;ASSERT(lag <= data_len)                (data_len must be >= 1: the counter is decremented before it is tested)
 455
 456         ;       for(coeff = 0; coeff < lag; coeff++)
 457         ;               autoc[coeff] = 0.0;
 458         xorps   xmm5, xmm5                      ; xmm5 = 0,0,0,0 (the four accumulators)
 459
 460         mov     edx, [esp + 8]                  ; edx == data_len
 461         mov     eax, [esp + 4]                  ; eax == &data[sample] <- &data[0]
 462
 463         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[0]
 464         add     eax, 4
 465         movaps  xmm2, xmm0                      ; xmm2 = 0,0,0,data[0]
 466         shufps  xmm0, xmm0, 0                   ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 467 .warmup:                                        ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
 468         mulps   xmm0, xmm2                      ; xmm0 = xmm0 * xmm2
 469         addps   xmm5, xmm0                      ; xmm5 += xmm0 * xmm2
 470         dec     edx
 471         jz      .loop_end                       ; data_len == 1: only the warmup product contributes
 472         ALIGN 16
 473 .loop_start:
 474         ; start by reading the next sample
 475         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[sample]
 476         add     eax, 4
 477         shufps  xmm0, xmm0, 0                   ; xmm0 = data[sample],data[sample],data[sample],data[sample]
 478         shufps  xmm2, xmm2, 93h                 ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
 479         movss   xmm2, xmm0                      ; replace the low float with the new sample, upper three floats unchanged
 480         mulps   xmm0, xmm2                      ; xmm0 = xmm0 * xmm2
 481         addps   xmm5, xmm0                      ; xmm5 += xmm0 * xmm2
 482         dec     edx
 483         jnz     .loop_start
 484 .loop_end:
 485         ; store autoc (always 4 floats / 16 bytes, unaligned store, regardless of lag)
 486         mov     edx, [esp + 16]                 ; edx == autoc
 487         movups  [edx], xmm5
 488
 489 .end:                                           ; not referenced by any jump in this routine
 490         ret
 491
 492         ALIGN 16
 493 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
             ; SSE autocorrelation of a single-precision signal for lags 0..7.
             ; xmm3:xmm2 hold the eight most recent samples (newest in lane 0 of xmm2),
             ; xmm6:xmm5 hold the eight running sums (lag 0 in lane 0 of xmm5).
             ; Samples before data[0] are taken as zero.
             ; The lag argument is never read here; eight floats are always stored to autoc[].
             ; data_len is decremented before it is tested, so data_len == 0 is not handled
             ; (the ASSERTs below imply data_len >= 1).
             ; Uses eax, edx, xmm0-xmm3, xmm5 and xmm6; arguments are read relative to the entry esp.
 494         ;[esp + 16] == autoc[]  (destination, written with two unaligned 16-byte stores)
 495         ;[esp + 12] == lag      (not read by this routine)
 496         ;[esp + 8] == data_len
 497         ;[esp + 4] == data[]
 498
 499         ;ASSERT(lag > 0)
 500         ;ASSERT(lag <= 8)
 501         ;ASSERT(lag <= data_len)
 502
 503         ;       for(coeff = 0; coeff < lag; coeff++)
 504         ;               autoc[coeff] = 0.0;
 505         xorps   xmm5, xmm5
 506         xorps   xmm6, xmm6
 507
 508         mov     edx, [esp + 8]                  ; edx == data_len (remaining samples, counts down)
 509         mov     eax, [esp + 4]                  ; eax == &data[sample] <- &data[0]
 510
 511         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[0]
 512         add     eax, 4
 513         movaps  xmm2, xmm0                      ; xmm2 = 0,0,0,data[0]
 514         shufps  xmm0, xmm0, 0                   ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 515         movaps  xmm1, xmm0                      ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 516         xorps   xmm3, xmm3                      ; xmm3 = 0,0,0,0
 517 .warmup:                                        ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
 518         mulps   xmm0, xmm2
 519         mulps   xmm1, xmm3                      ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
 520         addps   xmm5, xmm0
 521         addps   xmm6, xmm1                      ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
 522         dec     edx
 523         jz      .loop_end                       ; data_len == 1: nothing left for the main loop
 524         ALIGN 16
 525 .loop_start:
 526         ; start by reading the next sample
 527         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[sample]
 528         ; here we reorder the instructions; see the (#) indexes for a logical order
 529         shufps  xmm2, xmm2, 93h                 ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
 530         add     eax, 4                          ; (0)
 531         shufps  xmm3, xmm3, 93h                 ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
 532         shufps  xmm0, xmm0, 0                   ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
 533         movss   xmm3, xmm2                      ; (5) lane 0 of xmm3 <- sample that wrapped around in xmm2
 534         movaps  xmm1, xmm0                      ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
 535         movss   xmm2, xmm0                      ; (6) lane 0 of xmm2 <- data[sample]
 536         mulps   xmm1, xmm3                      ; (8)
 537         mulps   xmm0, xmm2                      ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
 538         addps   xmm6, xmm1                      ; (10)
 539         addps   xmm5, xmm0                      ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
 540         dec     edx
 541         jnz     .loop_start
 542 .loop_end:
 543         ; store autoc
 544         mov     edx, [esp + 16]                 ; edx == autoc
 545         movups  [edx], xmm5                     ; autoc[0..3]
 546         movups  [edx + 16], xmm6                ; autoc[4..7]
 547
 548 .end:                                           ; not referenced inside this routine
 549         ret
 550
 551         ALIGN 16
 552 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
             ; SSE autocorrelation of a single-precision signal for lags 0..11.
             ; xmm4:xmm3:xmm2 hold the twelve most recent samples (newest in lane 0 of xmm2),
             ; xmm7:xmm6:xmm5 hold the twelve running sums (lag 0 in lane 0 of xmm5),
             ; xmm0 is the broadcast current sample and xmm1 a scratch copy of it.
             ; Samples before data[0] are taken as zero.
             ; The lag argument is never read here; twelve floats are always stored to autoc[].
             ; data_len is decremented before it is tested, so data_len == 0 is not handled
             ; (the ASSERTs below imply data_len >= 1).
 553         ;[esp + 16] == autoc[]  (destination, written with three unaligned 16-byte stores)
 554         ;[esp + 12] == lag      (not read by this routine)
 555         ;[esp + 8] == data_len
 556         ;[esp + 4] == data[]
 557
 558         ;ASSERT(lag > 0)
 559         ;ASSERT(lag <= 12)
 560         ;ASSERT(lag <= data_len)
 561
 562         ;       for(coeff = 0; coeff < lag; coeff++)
 563         ;               autoc[coeff] = 0.0;
 564         xorps   xmm5, xmm5
 565         xorps   xmm6, xmm6
 566         xorps   xmm7, xmm7
 567
 568         mov     edx, [esp + 8]                  ; edx == data_len (remaining samples, counts down)
 569         mov     eax, [esp + 4]                  ; eax == &data[sample] <- &data[0]
 570
 571         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[0]
 572         add     eax, 4
 573         movaps  xmm2, xmm0                      ; xmm2 = 0,0,0,data[0]
 574         shufps  xmm0, xmm0, 0                   ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 575         xorps   xmm3, xmm3                      ; xmm3 = 0,0,0,0
 576         xorps   xmm4, xmm4                      ; xmm4 = 0,0,0,0
 577 .warmup:                                        ; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
 578         movaps  xmm1, xmm0
 579         mulps   xmm1, xmm2
 580         addps   xmm5, xmm1
 581         movaps  xmm1, xmm0
 582         mulps   xmm1, xmm3
 583         addps   xmm6, xmm1
 584         mulps   xmm0, xmm4
 585         addps   xmm7, xmm0                      ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
 586         dec     edx
 587         jz      .loop_end                       ; data_len == 1: nothing left for the main loop
 588         ALIGN 16
 589 .loop_start:
 590         ; start by reading the next sample
 591         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[sample]
 592         add     eax, 4
 593         shufps  xmm0, xmm0, 0                   ; xmm0 = data[sample],data[sample],data[sample],data[sample]
 594
 595         ; shift xmm4:xmm3:xmm2 left by one float
 596         shufps  xmm2, xmm2, 93h                 ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
 597         shufps  xmm3, xmm3, 93h                 ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
 598         shufps  xmm4, xmm4, 93h                 ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
 599         movss   xmm4, xmm3                      ; lane 0 of xmm4 <- sample that wrapped around in xmm3
 600         movss   xmm3, xmm2                      ; lane 0 of xmm3 <- sample that wrapped around in xmm2
 601         movss   xmm2, xmm0                      ; lane 0 of xmm2 <- data[sample]
 602
 603         ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
 604         movaps  xmm1, xmm0
 605         mulps   xmm1, xmm2
 606         addps   xmm5, xmm1
 607         movaps  xmm1, xmm0
 608         mulps   xmm1, xmm3
 609         addps   xmm6, xmm1
 610         mulps   xmm0, xmm4                      ; xmm0 may be destroyed here: it is reloaded at the top of the loop
 611         addps   xmm7, xmm0
 612
 613         dec     edx
 614         jnz     .loop_start
 615 .loop_end:
 616         ; store autoc
 617         mov     edx, [esp + 16]                 ; edx == autoc
 618         movups  [edx], xmm5                     ; autoc[0..3]
 619         movups  [edx + 16], xmm6                ; autoc[4..7]
 620         movups  [edx + 32], xmm7                ; autoc[8..11]
 621
 622 .end:                                           ; not referenced inside this routine
 623         ret
 624
 625         ALIGN 16
 626 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
             ; SSE autocorrelation of a single-precision signal for lags 0..15.
             ; xmm4:xmm3:xmm2:xmm1 hold the sixteen most recent samples (newest in lane 0 of xmm1).
             ; The sixteen running sums live in xmm5 (lags 0..3), xmm6 (lags 4..7) and two
             ; 16-byte aligned stack slots, [esp] (lags 8..11) and [esp + 16] (lags 12..15).
             ; xmm0 is the broadcast current sample, xmm7 is scratch.
             ; The lag argument is never read here; sixteen floats are always stored to autoc[].
             ; data_len is decremented before it is tested, so data_len == 0 is not handled.
             ; ebp is saved and used as frame pointer because esp is re-aligned below.
 627         ;[ebp + 20] == autoc[]
 628         ;[ebp + 16] == lag      (not read by this routine)
 629         ;[ebp + 12] == data_len
 630         ;[ebp +  8] == data[]
 631         ;[esp] == __m128        (accumulator for lags 8..11)
 632         ;[esp + 16] == __m128   (accumulator for lags 12..15)
 633
 634         push    ebp
 635         mov     ebp, esp
 636         and     esp, -16 ; stack realign for SSE instructions 'movaps' and 'addps'
 637         sub     esp, 32                         ; room for the two __m128 accumulators
 638
 639         ;ASSERT(lag > 0)
 640         ;ASSERT(lag <= 16)
 641         ;ASSERT(lag <= data_len)
 642         ;ASSERT(data_len > 0)
 643
 644         ;       for(coeff = 0; coeff < lag; coeff++)
 645         ;               autoc[coeff] = 0.0;
 646         xorps   xmm5, xmm5
 647         xorps   xmm6, xmm6
 648         movaps  [esp], xmm5                     ; zero the two stack accumulators as well
 649         movaps  [esp + 16], xmm6
 650
 651         mov     edx, [ebp + 12]                 ; edx == data_len (remaining samples, counts down)
 652         mov     eax, [ebp +  8]                 ; eax == &data[sample] <- &data[0]
 653
 654         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[0]
 655         add     eax, 4
 656         movaps  xmm1, xmm0                      ; xmm1 = 0,0,0,data[0]
 657         shufps  xmm0, xmm0, 0           ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 658         xorps   xmm2, xmm2                      ; xmm2 = 0,0,0,0
 659         xorps   xmm3, xmm3                      ; xmm3 = 0,0,0,0
 660         xorps   xmm4, xmm4                      ; xmm4 = 0,0,0,0
 661         movaps  xmm7, xmm0                      ; first sample: only xmm1 is non-zero, so only xmm5 is updated
 662         mulps   xmm7, xmm1
 663         addps   xmm5, xmm7
 664         dec     edx
 665         jz      .loop_end                       ; data_len == 1: nothing left for the main loop
 666         ALIGN 16
 667 .loop_start:
 668         ; start by reading the next sample
 669         movss   xmm0, [eax]                             ; xmm0 = 0,0,0,data[sample]
 670         add     eax, 4
 671         shufps  xmm0, xmm0, 0                   ; xmm0 = data[sample],data[sample],data[sample],data[sample]
 672
 673         ; shift xmm4:xmm3:xmm2:xmm1 left by one float
 674         shufps  xmm1, xmm1, 93h                 ; 93h=2-1-0-3 => each register gets rotated left by one float
 675         shufps  xmm2, xmm2, 93h
 676         shufps  xmm3, xmm3, 93h
 677         shufps  xmm4, xmm4, 93h
 678         movss   xmm4, xmm3                      ; lane 0 of each register <- sample that wrapped around in the next lower one
 679         movss   xmm3, xmm2
 680         movss   xmm2, xmm1
 681         movss   xmm1, xmm0                      ; lane 0 of xmm1 <- data[sample]
 682
 683         ; xmmB:xmmA:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1   (xmmA == [esp], xmmB == [esp + 16])
 684         movaps  xmm7, xmm0
 685         mulps   xmm7, xmm1
 686         addps   xmm5, xmm7
 687         movaps  xmm7, xmm0
 688         mulps   xmm7, xmm2
 689         addps   xmm6, xmm7
 690         movaps  xmm7, xmm0
 691         mulps   xmm7, xmm3
 692         mulps   xmm0, xmm4
 693         addps   xmm7, [esp]                     ; memory operand of addps must be 16-byte aligned (see the realign above)
 694         addps   xmm0, [esp + 16]
 695         movaps  [esp], xmm7
 696         movaps  [esp + 16], xmm0
 697
 698         dec     edx
 699         jnz     .loop_start
 700 .loop_end:
 701         ; store autoc
 702         mov     edx, [ebp + 20]                         ; edx == autoc
 703         movups  [edx], xmm5                     ; autoc[0..3]
 704         movups  [edx + 16], xmm6                ; autoc[4..7]
 705         movaps  xmm5, [esp]
 706         movaps  xmm6, [esp + 16]
 707         movups  [edx + 32], xmm5                ; autoc[8..11]
 708         movups  [edx + 48], xmm6                ; autoc[12..15]
 709 .end:                                           ; not referenced inside this routine
 710         mov     esp, ebp                        ; undo the realignment and the 32-byte reservation
 711         pop     ebp
 712         ret
 713
 714 ;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
 715 ;
 716 ;       for(i = 0; i < data_len; i++) {
 717 ;               sum = 0;
 718 ;               for(j = 0; j < order; j++)
 719 ;                       sum += qlp_coeff[j] * data[i-j-1];
 720 ;               residual[i] = data[i] - (sum >> lp_quantization);
 721 ;       }
 722 ;
 723         ALIGN   16
 724 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
             ; Integer LPC residual: residual[i] = data[i] - (sum >> lp_quantization), where
             ; sum = qlp_coeff[0]*data[i-1] + ... + qlp_coeff[order-1]*data[i-order] (32-bit arithmetic).
             ; Three code paths are selected by order:
             ;   order <= 1 (signed compare)    -> .i_1_loop_i, keeps the previous sample in eax
             ;   order <= 32 (unsigned compare) -> .i_32, unrolled chain entered through a computed jump
             ;   otherwise                      -> .i_32more_loop_i, generic nested loop
             ; Register roles: esi = &data[i], edi = &residual[i] (or residual - data in .i_32),
             ; ebx = samples remaining, ebp = sum, cl = lp_quantization.
             ; ebp, ebx, esi and edi are saved on entry and restored at .end.
             ; .begin is also a jump target of the _mmx variant, which arrives with
             ; esi, edi, ebx and eax already set up and the same four registers pushed.
             ; The stack offsets below are valid after the four pushes.
 725         ;[esp + 40]     residual[]
 726         ;[esp + 36]     lp_quantization
 727         ;[esp + 32]     order
 728         ;[esp + 28]     qlp_coeff[]
 729         ;[esp + 24]     data_len
 730         ;[esp + 20]     data[]
 731
 732         ;ASSERT(order > 0)
 733
 734         push    ebp
 735         push    ebx
 736         push    esi
 737         push    edi
 738
 739         mov     esi, [esp + 20]                 ; esi = data[]
 740         mov     edi, [esp + 40]                 ; edi = residual[]
 741         mov     eax, [esp + 32]                 ; eax = order
 742         mov     ebx, [esp + 24]                 ; ebx = data_len
 743
 744         test    ebx, ebx
 745         jz      near .end                       ; do nothing if data_len == 0
 746 .begin:
 747         cmp     eax, byte 1
 748         jg      short .i_1more                  ; signed compare: order >= 2 leaves the single-coefficient path
 749
 750         mov     ecx, [esp + 28]
 751         mov     edx, [ecx]                      ; edx = qlp_coeff[0]
 752         mov     eax, [esi - 4]                  ; eax = data[-1]
 753         mov     ecx, [esp + 36]                 ; cl = lp_quantization
 754         ALIGN   16
 755 .i_1_loop_i:                                    ; on entry eax = data[i-1]
 756         imul    eax, edx                        ; eax = qlp_coeff[0] * data[i-1]
 757         sar     eax, cl                         ; eax = sum >> lp_quantization
 758         neg     eax
 759         add     eax, [esi]                      ; eax = data[i] - (sum >> lp_quantization)
 760         mov     [edi], eax                      ; residual[i] = eax
 761         mov     eax, [esi]                      ; eax = data[i], the previous sample of the next iteration
 762         add     edi, byte 4
 763         add     esi, byte 4
 764         dec     ebx
 765         jnz     .i_1_loop_i
 766
 767         jmp     .end
 768
 769 .i_1more:
 770         cmp     eax, byte 32                    ; for order <= 32 there is a faster routine
 771         jbe     short .i_32
 772
 773         ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
 774         ALIGN 16
 775 .i_32more_loop_i:
 776         xor     ebp, ebp                        ; sum = 0
 777         mov     ecx, [esp + 32]
 778         mov     edx, ecx
 779         shl     edx, 2
 780         add     edx, [esp + 28]                 ; edx = &qlp_coeff[order]
 781         neg     ecx                             ; ecx = -order, counts up to 0
 782         ALIGN   16
 783 .i_32more_loop_j:
 784         sub     edx, byte 4                     ; walk the coefficients from qlp_coeff[order-1] down to qlp_coeff[0]
 785         mov     eax, [edx]
 786         imul    eax, [esi + 4 * ecx]            ; ... while the samples go from data[i-order] up to data[i-1]
 787         add     ebp, eax
 788         inc     ecx
 789         jnz     short .i_32more_loop_j
 790
 791         mov     ecx, [esp + 36]                 ; cl = lp_quantization
 792         sar     ebp, cl
 793         neg     ebp
 794         add     ebp, [esi]                      ; ebp = data[i] - (sum >> lp_quantization)
 795         mov     [edi], ebp
 796         add     esi, byte 4
 797         add     edi, byte 4
 798
 799         dec     ebx
 800         jnz     .i_32more_loop_i
 801
 802         jmp     .end
 803
 804 .mov_eip_to_eax:                                ; helper: returns its own return address in eax
 805         mov     eax, [esp]
 806         ret
 807
 808 .i_32:
 809         sub     edi, esi                        ; edi = residual - data, so [edi + esi] addresses residual[i]
 810         neg     eax                             ; eax = -order
 811         lea     edx, [eax + eax * 8 + .jumper_0 - .get_eip0]   ; edx = -9 * order + (.jumper_0 - .get_eip0)
 812         call    .mov_eip_to_eax                 ; eax = address of .get_eip0 (no absolute address is needed)
 813 .get_eip0:
 814         add     edx, eax                        ; edx = .jumper_0 - 9 * order
 815         inc     edx                             ; compensate for the shorter opcode on the last group (see below)
 816         mov     eax, [esp + 28]                 ; eax = qlp_coeff[]
 817         xor     ebp, ebp
 818         jmp     edx                             ; enter the chain so that exactly `order` groups are executed
 819
             ; Unrolled chain, one mov/imul/add group per coefficient, highest coefficient first.
             ; The lea above scales by 9, i.e. it relies on every group assembling to 9 bytes;
             ; do not change operands or encodings here without revisiting that computation.
 820         mov     ecx, [eax + 124]
 821         imul    ecx, [esi - 128]
 822         add     ebp, ecx
 823         mov     ecx, [eax + 120]
 824         imul    ecx, [esi - 124]
 825         add     ebp, ecx
 826         mov     ecx, [eax + 116]
 827         imul    ecx, [esi - 120]
 828         add     ebp, ecx
 829         mov     ecx, [eax + 112]
 830         imul    ecx, [esi - 116]
 831         add     ebp, ecx
 832         mov     ecx, [eax + 108]
 833         imul    ecx, [esi - 112]
 834         add     ebp, ecx
 835         mov     ecx, [eax + 104]
 836         imul    ecx, [esi - 108]
 837         add     ebp, ecx
 838         mov     ecx, [eax + 100]
 839         imul    ecx, [esi - 104]
 840         add     ebp, ecx
 841         mov     ecx, [eax + 96]
 842         imul    ecx, [esi - 100]
 843         add     ebp, ecx
 844         mov     ecx, [eax + 92]
 845         imul    ecx, [esi - 96]
 846         add     ebp, ecx
 847         mov     ecx, [eax + 88]
 848         imul    ecx, [esi - 92]
 849         add     ebp, ecx
 850         mov     ecx, [eax + 84]
 851         imul    ecx, [esi - 88]
 852         add     ebp, ecx
 853         mov     ecx, [eax + 80]
 854         imul    ecx, [esi - 84]
 855         add     ebp, ecx
 856         mov     ecx, [eax + 76]
 857         imul    ecx, [esi - 80]
 858         add     ebp, ecx
 859         mov     ecx, [eax + 72]
 860         imul    ecx, [esi - 76]
 861         add     ebp, ecx
 862         mov     ecx, [eax + 68]
 863         imul    ecx, [esi - 72]
 864         add     ebp, ecx
 865         mov     ecx, [eax + 64]
 866         imul    ecx, [esi - 68]
 867         add     ebp, ecx
 868         mov     ecx, [eax + 60]
 869         imul    ecx, [esi - 64]
 870         add     ebp, ecx
 871         mov     ecx, [eax + 56]
 872         imul    ecx, [esi - 60]
 873         add     ebp, ecx
 874         mov     ecx, [eax + 52]
 875         imul    ecx, [esi - 56]
 876         add     ebp, ecx
 877         mov     ecx, [eax + 48]
 878         imul    ecx, [esi - 52]
 879         add     ebp, ecx
 880         mov     ecx, [eax + 44]
 881         imul    ecx, [esi - 48]
 882         add     ebp, ecx
 883         mov     ecx, [eax + 40]
 884         imul    ecx, [esi - 44]
 885         add     ebp, ecx
 886         mov     ecx, [eax + 36]
 887         imul    ecx, [esi - 40]
 888         add     ebp, ecx
 889         mov     ecx, [eax + 32]
 890         imul    ecx, [esi - 36]
 891         add     ebp, ecx
 892         mov     ecx, [eax + 28]
 893         imul    ecx, [esi - 32]
 894         add     ebp, ecx
 895         mov     ecx, [eax + 24]
 896         imul    ecx, [esi - 28]
 897         add     ebp, ecx
 898         mov     ecx, [eax + 20]
 899         imul    ecx, [esi - 24]
 900         add     ebp, ecx
 901         mov     ecx, [eax + 16]
 902         imul    ecx, [esi - 20]
 903         add     ebp, ecx
 904         mov     ecx, [eax + 12]
 905         imul    ecx, [esi - 16]
 906         add     ebp, ecx
 907         mov     ecx, [eax + 8]
 908         imul    ecx, [esi - 12]
 909         add     ebp, ecx
 910         mov     ecx, [eax + 4]
 911         imul    ecx, [esi - 8]
 912         add     ebp, ecx
 913         mov     ecx, [eax]                      ; there is one byte missing (no displacement), hence the `inc edx` above
 914         imul    ecx, [esi - 4]
 915         add     ebp, ecx
 916 .jumper_0:
 917
 918         mov     ecx, [esp + 36]                 ; cl = lp_quantization
 919         sar     ebp, cl
 920         neg     ebp
 921         add     ebp, [esi]                      ; ebp = data[i] - (sum >> lp_quantization)
 922         mov     [edi + esi], ebp                ; residual[i] = ebp (edi holds residual - data)
 923         add     esi, byte 4
 924
 925         dec     ebx
 926         jz      short .end
 927         xor     ebp, ebp                        ; sum = 0 for the next sample
 928         jmp     edx                             ; re-enter the chain at the same entry point
 929
 930 .end:
 931         pop     edi
 932         pop     esi
 933         pop     ebx
 934         pop     ebp
 935         ret
 936
 937 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
 938 ; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
 939 ; cannot be used for side-channel coded 16bps channels since the effective bps
 940 ; is 17.
 941         ALIGN   16
 942 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
             ; MMX variant of the residual computation; samples and coefficients are handled
             ; as 16-bit words and two residuals are produced per iteration.
             ; A possible final single sample is handed to the scalar routine by jumping to
             ; FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin, whose epilogue
             ; then pops the four registers pushed here.
             ; The low 16 bits of every coefficient are pushed (qlp_coeff[0] first, so it ends up
             ; at the highest address) and zero-padded to a multiple of four words below an
             ; 8-byte aligned esp; ebp keeps the original esp while that area is live.
             ; Register roles in the MMX part: esi = &data[i], edi = &residual[i],
             ; ebx = data_len - 1 counting down by 2, eax = padded order,
             ; mm4 = previous four samples as words, mm5 = coefficients 0..3 as words,
             ; mm6 = lp_quantization (shift count), mm3/mm2 = partial sums for samples i / i+1.
             ; The stack offsets below are valid after the four pushes.
 943         ;[esp + 40]     residual[]
 944         ;[esp + 36]     lp_quantization
 945         ;[esp + 32]     order
 946         ;[esp + 28]     qlp_coeff[]
 947         ;[esp + 24]     data_len
 948         ;[esp + 20]     data[]
 949
 950         ;ASSERT(order > 0)
 951
 952         push    ebp
 953         push    ebx
 954         push    esi
 955         push    edi
 956
 957         mov     esi, [esp + 20]                 ; esi = data[]
 958         mov     edi, [esp + 40]                 ; edi = residual[]
 959         mov     eax, [esp + 32]                 ; eax = order
 960         mov     ebx, [esp + 24]                 ; ebx = data_len
 961
 962         test    ebx, ebx
 963         jz      near .end                       ; do nothing if data_len == 0
 964         dec     ebx                             ; ebx = data_len - 1
 965         test    ebx, ebx
 966         jz      near .last_one                  ; a single sample goes straight to the scalar routine
 967
 968         mov     edx, [esp + 28]                 ; edx = qlp_coeff[]
 969         movd    mm6, [esp + 36]                 ; mm6 = 0:lp_quantization
 970         mov     ebp, esp                        ; remember esp; restored at .mmx_end
 971
 972         and     esp, 0xfffffff8                 ; 8-byte align the temporary coefficient area
 973
 974         xor     ecx, ecx
 975 .copy_qlp_loop:
 976         push    word [edx + 4 * ecx]            ; push the low 16 bits of qlp_coeff[ecx]
 977         inc     ecx
 978         cmp     ecx, eax
 979         jnz     short .copy_qlp_loop
 980
 981         and     ecx, 0x3                        ; ecx = order mod 4
 982         test    ecx, ecx
 983         je      short .za_end
 984         sub     ecx, byte 4                     ; ecx = -(number of padding words needed)
 985 .za_loop:
 986         push    word 0                          ; pad with zero coefficients ...
 987         inc     eax                             ; ... and count them in eax (eax = padded order)
 988         inc     ecx
 989         jnz     short .za_loop
 990 .za_end:
 991
 992         movq    mm5, [esp + 2 * eax - 8]        ; mm5 = the first four words pushed (coefficients 0..3)
 993         movd    mm4, [esi - 16]
 994         punpckldq       mm4, [esi - 12]         ; mm4 = data[-3]:data[-4]
 995         movd    mm0, [esi - 8]
 996         punpckldq       mm0, [esi - 4]          ; mm0 = data[-1]:data[-2]
 997         packssdw        mm4, mm0                ; mm4 = data[-1],data[-2],data[-3],data[-4] as saturated words
 998
 999         cmp     eax, byte 4
1000         jnbe    short .mmx_4more                ; padded order > 4 needs the inner coefficient loop
1001
1002         ALIGN   16
1003 .mmx_4_loop_i:
1004         movd    mm1, [esi]
1005         movq    mm3, mm4                        ; mm3 = history for sample i
1006         punpckldq       mm1, [esi + 4]          ; mm1 = data[i+1]:data[i]
1007         psrlq   mm4, 16                         ; drop the oldest word
1008         movq    mm0, mm1
1009         psllq   mm0, 48                         ; low word of data[i] moved to the top word
1010         por     mm4, mm0
1011         movq    mm2, mm4                        ; mm2 = history for sample i+1
1012         psrlq   mm4, 16
1013         pxor    mm0, mm0
1014         punpckhdq       mm0, mm1                ; mm0 = data[i+1]:0
1015         pmaddwd mm3, mm5                        ; two dword partial sums for sample i
1016         pmaddwd mm2, mm5                        ; two dword partial sums for sample i+1
1017         psllq   mm0, 16                         ; low word of data[i+1] moved to the top word
1018         por     mm4, mm0                        ; mm4 = history for sample i+2
1019         movq    mm0, mm3
1020         punpckldq       mm3, mm2
1021         punpckhdq       mm0, mm2
1022         paddd   mm3, mm0                        ; mm3 = sum(i+1):sum(i)
1023         psrad   mm3, mm6                        ; arithmetic shift by lp_quantization
1024         psubd   mm1, mm3                        ; mm1 = data - (sum >> lp_quantization) for both samples
1025         movd    [edi], mm1                      ; residual[i]
1026         punpckhdq       mm1, mm1
1027         movd    [edi + 4], mm1                  ; residual[i+1]
1028
1029         add     edi, byte 8
1030         add     esi, byte 8
1031
1032         sub     ebx, 2
1033         jg      .mmx_4_loop_i                   ; continue while at least two samples remain after this pair
1034         jmp     .mmx_end
1035
1036 .mmx_4more:
1037         shl     eax, 2
1038         neg     eax
1039         add     eax, byte 16                    ; eax = 16 - 4 * padded order (byte offset from &data[i])
1040
1041         ALIGN   16
1042 .mmx_4more_loop_i:
1043         movd    mm1, [esi]
1044         punpckldq       mm1, [esi + 4]          ; mm1 = data[i+1]:data[i]
1045         movq    mm3, mm4                        ; mm3 = history for sample i
1046         psrlq   mm4, 16
1047         movq    mm0, mm1
1048         psllq   mm0, 48
1049         por     mm4, mm0
1050         movq    mm2, mm4                        ; mm2 = history for sample i+1
1051         psrlq   mm4, 16
1052         pxor    mm0, mm0
1053         punpckhdq       mm0, mm1
1054         pmaddwd mm3, mm5                        ; contribution of coefficients 0..3 to sample i
1055         pmaddwd mm2, mm5                        ; contribution of coefficients 0..3 to sample i+1
1056         psllq   mm0, 16
1057         por     mm4, mm0                        ; mm4 = history for sample i+2
1058
1059         mov     ecx, esi
1060         add     ecx, eax                        ; ecx = esi + 16 - 4 * padded order
1061         mov     edx, esp                        ; edx = lowest address of the coefficient copy
1062
1063         ALIGN   16
1064 .mmx_4more_loop_j:                              ; four further coefficients per iteration, until ecx == esi
1065         movd    mm0, [ecx - 16]
1066         movd    mm7, [ecx - 8]
1067         punpckldq       mm0, [ecx - 12]
1068         punpckldq       mm7, [ecx - 4]
1069         packssdw        mm0, mm7                ; four samples for sample i, packed to words
1070         pmaddwd mm0, [edx]
1071         punpckhdq       mm7, mm7                ; keep [ecx - 4] in the low dword of mm7
1072         paddd   mm3, mm0
1073         movd    mm0, [ecx - 12]
1074         punpckldq       mm0, [ecx - 8]
1075         punpckldq       mm7, [ecx]
1076         packssdw        mm0, mm7                ; the same window moved by one sample, for sample i+1
1077         pmaddwd mm0, [edx]
1078         paddd   mm2, mm0
1079
1080         add     edx, byte 8
1081         add     ecx, byte 16
1082         cmp     ecx, esi
1083         jnz     .mmx_4more_loop_j
1084
1085         movq    mm0, mm3
1086         punpckldq       mm3, mm2
1087         punpckhdq       mm0, mm2
1088         paddd   mm3, mm0                        ; mm3 = sum(i+1):sum(i)
1089         psrad   mm3, mm6                        ; arithmetic shift by lp_quantization
1090         psubd   mm1, mm3
1091         movd    [edi], mm1                      ; residual[i]
1092         punpckhdq       mm1, mm1
1093         movd    [edi + 4], mm1                  ; residual[i+1]
1094
1095         add     edi, byte 8
1096         add     esi, byte 8
1097
1098         sub     ebx, 2
1099         jg      near .mmx_4more_loop_i
1100
1101 .mmx_end:
1102         emms
1103         mov     esp, ebp                        ; discard the coefficient copy
1104 .last_one:
1105         mov     eax, [esp + 32]                 ; eax = order, as expected at the scalar routine's .begin
1106         inc     ebx                             ; ebx = samples still to do (1 or 0)
1107         jnz     near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
1108
1109 .end:
1110         pop     edi
1111         pop     esi
1112         pop     ebx
1113         pop     ebp
1114         ret
1115
1116 ; **********************************************************************
1117 ;
1118 ; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1119 ; {
1120 ;       unsigned i, j;
1121 ;       FLAC__int32 sum;
1122 ;
1123 ;       FLAC__ASSERT(order > 0);
1124 ;
1125 ;       for(i = 0; i < data_len; i++) {
1126 ;               sum = 0;
1127 ;               for(j = 0; j < order; j++)
1128 ;                       sum += qlp_coeff[j] * data[i-j-1];
1129 ;               data[i] = residual[i] + (sum >> lp_quantization);
1130 ;       }
1131 ; }
1132         ALIGN   16
1133 cident FLAC__lpc_restore_signal_asm_ia32
1134         ;[esp + 40]     data[]
1135         ;[esp + 36]     lp_quantization
1136         ;[esp + 32]     order
1137         ;[esp + 28]     qlp_coeff[]
1138         ;[esp + 24]     data_len
1139         ;[esp + 20]     residual[]
1140
1141         ;ASSERT(order > 0)
1142
1143         push    ebp
1144         push    ebx
1145         push    esi
1146         push    edi
1147
1148         mov     esi, [esp + 20]                 ; esi = residual[]
1149         mov     edi, [esp + 40]                 ; edi = data[]
1150         mov     eax, [esp + 32]                 ; eax = order
1151         mov     ebx, [esp + 24]                 ; ebx = data_len
1152
1153         test    ebx, ebx
1154         jz      near .end                       ; do nothing if data_len == 0
1155
1156 .begin:
1157         cmp     eax, byte 1
1158         jg      short .x87_1more
1159
1160         mov     ecx, [esp + 28]
1161         mov     edx, [ecx]
1162         mov     eax, [edi - 4]
1163         mov     ecx, [esp + 36]
1164         ALIGN   16
1165 .x87_1_loop_i:
1166         imul    eax, edx
1167         sar     eax, cl
1168         add     eax, [esi]
1169         mov     [edi], eax
1170         add     esi, byte 4
1171         add     edi, byte 4
1172         dec     ebx
1173         jnz     .x87_1_loop_i
1174
1175         jmp     .end
1176
1177 .x87_1more:
1178         cmp     eax, byte 32                    ; for order <= 32 there is a faster routine
1179         jbe     short .x87_32
1180
1181         ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
1182         ALIGN 16
1183 .x87_32more_loop_i:
1184         xor     ebp, ebp
1185         mov     ecx, [esp + 32]
1186         mov     edx, ecx
1187         shl     edx, 2
1188         add     edx, [esp + 28]
1189         neg     ecx
1190         ALIGN   16
1191 .x87_32more_loop_j:
1192         sub     edx, byte 4
1193         mov     eax, [edx]
1194         imul    eax, [edi + 4 * ecx]
1195         add     ebp, eax
1196         inc     ecx
1197         jnz     short .x87_32more_loop_j
1198
1199         mov     ecx, [esp + 36]
1200         sar     ebp, cl
1201         add     ebp, [esi]
1202         mov     [edi], ebp
1203         add     edi, byte 4
1204         add     esi, byte 4
1205
1206         dec     ebx
1207         jnz     .x87_32more_loop_i
1208
1209         jmp     .end
1210
1211 .mov_eip_to_eax:
1212         mov     eax, [esp]
1213         ret
1214
1215 .x87_32:
1216         sub     esi, edi
1217         neg     eax
1218         lea     edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
1219         call    .mov_eip_to_eax
1220 .get_eip0:
1221         add     edx, eax
1222         inc     edx                             ; compensate for the shorter opcode on the last iteration
1223         mov     eax, [esp + 28]                 ; eax = qlp_coeff[]
1224         xor     ebp, ebp
1225         jmp     edx
1226
1227         mov     ecx, [eax + 124]                ; ecx =  qlp_coeff[31]
1228         imul    ecx, [edi - 128]                ; ecx =  qlp_coeff[31] * data[i-32]
1229         add     ebp, ecx                        ; sum += qlp_coeff[31] * data[i-32]
1230         mov     ecx, [eax + 120]                ; ecx =  qlp_coeff[30]
1231         imul    ecx, [edi - 124]                ; ecx =  qlp_coeff[30] * data[i-31]
1232         add     ebp, ecx                        ; sum += qlp_coeff[30] * data[i-31]
1233         mov     ecx, [eax + 116]                ; ecx =  qlp_coeff[29]
1234         imul    ecx, [edi - 120]                ; ecx =  qlp_coeff[29] * data[i-30]
1235         add     ebp, ecx                        ; sum += qlp_coeff[29] * data[i-30]
1236         mov     ecx, [eax + 112]                ; ecx =  qlp_coeff[28]
1237         imul    ecx, [edi - 116]                ; ecx =  qlp_coeff[28] * data[i-29]
1238         add     ebp, ecx                        ; sum += qlp_coeff[28] * data[i-29]
1239         mov     ecx, [eax + 108]                ; ecx =  qlp_coeff[27]
1240         imul    ecx, [edi - 112]                ; ecx =  qlp_coeff[27] * data[i-28]
1241         add     ebp, ecx                        ; sum += qlp_coeff[27] * data[i-28]
1242         mov     ecx, [eax + 104]                ; ecx =  qlp_coeff[26]
1243         imul    ecx, [edi - 108]                ; ecx =  qlp_coeff[26] * data[i-27]
1244         add     ebp, ecx                        ; sum += qlp_coeff[26] * data[i-27]
1245         mov     ecx, [eax + 100]                ; ecx =  qlp_coeff[25]
1246         imul    ecx, [edi - 104]                ; ecx =  qlp_coeff[25] * data[i-26]
1247         add     ebp, ecx                        ; sum += qlp_coeff[25] * data[i-26]
1248         mov     ecx, [eax + 96]                 ; ecx =  qlp_coeff[24]
1249         imul    ecx, [edi - 100]                ; ecx =  qlp_coeff[24] * data[i-25]
1250         add     ebp, ecx                        ; sum += qlp_coeff[24] * data[i-25]
1251         mov     ecx, [eax + 92]                 ; ecx =  qlp_coeff[23]
1252         imul    ecx, [edi - 96]                 ; ecx =  qlp_coeff[23] * data[i-24]
1253         add     ebp, ecx                        ; sum += qlp_coeff[23] * data[i-24]
1254         mov     ecx, [eax + 88]                 ; ecx =  qlp_coeff[22]
1255         imul    ecx, [edi - 92]                 ; ecx =  qlp_coeff[22] * data[i-23]
1256         add     ebp, ecx                        ; sum += qlp_coeff[22] * data[i-23]
1257         mov     ecx, [eax + 84]                 ; ecx =  qlp_coeff[21]
1258         imul    ecx, [edi - 88]                 ; ecx =  qlp_coeff[21] * data[i-22]
1259         add     ebp, ecx                        ; sum += qlp_coeff[21] * data[i-22]
1260         mov     ecx, [eax + 80]                 ; ecx =  qlp_coeff[20]
1261         imul    ecx, [edi - 84]                 ; ecx =  qlp_coeff[20] * data[i-21]
1262         add     ebp, ecx                        ; sum += qlp_coeff[20] * data[i-21]
1263         mov     ecx, [eax + 76]                 ; ecx =  qlp_coeff[19]
1264         imul    ecx, [edi - 80]                 ; ecx =  qlp_coeff[19] * data[i-20]
1265         add     ebp, ecx                        ; sum += qlp_coeff[19] * data[i-20]
1266         mov     ecx, [eax + 72]                 ; ecx =  qlp_coeff[18]
1267         imul    ecx, [edi - 76]                 ; ecx =  qlp_coeff[18] * data[i-19]
1268         add     ebp, ecx                        ; sum += qlp_coeff[18] * data[i-19]
1269         mov     ecx, [eax + 68]                 ; ecx =  qlp_coeff[17]
1270         imul    ecx, [edi - 72]                 ; ecx =  qlp_coeff[17] * data[i-18]
1271         add     ebp, ecx                        ; sum += qlp_coeff[17] * data[i-18]
1272         mov     ecx, [eax + 64]                 ; ecx =  qlp_coeff[16]
1273         imul    ecx, [edi - 68]                 ; ecx =  qlp_coeff[16] * data[i-17]
1274         add     ebp, ecx                        ; sum += qlp_coeff[16] * data[i-17]
1275         mov     ecx, [eax + 60]                 ; ecx =  qlp_coeff[15]
1276         imul    ecx, [edi - 64]                 ; ecx =  qlp_coeff[15] * data[i-16]
1277         add     ebp, ecx                        ; sum += qlp_coeff[15] * data[i-16]
1278         mov     ecx, [eax + 56]                 ; ecx =  qlp_coeff[14]
1279         imul    ecx, [edi - 60]                 ; ecx =  qlp_coeff[14] * data[i-15]
1280         add     ebp, ecx                        ; sum += qlp_coeff[14] * data[i-15]
1281         mov     ecx, [eax + 52]                 ; ecx =  qlp_coeff[13]
1282         imul    ecx, [edi - 56]                 ; ecx =  qlp_coeff[13] * data[i-14]
1283         add     ebp, ecx                        ; sum += qlp_coeff[13] * data[i-14]
1284         mov     ecx, [eax + 48]                 ; ecx =  qlp_coeff[12]
1285         imul    ecx, [edi - 52]                 ; ecx =  qlp_coeff[12] * data[i-13]
1286         add     ebp, ecx                        ; sum += qlp_coeff[12] * data[i-13]
1287         mov     ecx, [eax + 44]                 ; ecx =  qlp_coeff[11]
1288         imul    ecx, [edi - 48]                 ; ecx =  qlp_coeff[11] * data[i-12]
1289         add     ebp, ecx                        ; sum += qlp_coeff[11] * data[i-12]
1290         mov     ecx, [eax + 40]                 ; ecx =  qlp_coeff[10]
1291         imul    ecx, [edi - 44]                 ; ecx =  qlp_coeff[10] * data[i-11]
1292         add     ebp, ecx                        ; sum += qlp_coeff[10] * data[i-11]
1293         mov     ecx, [eax + 36]                 ; ecx =  qlp_coeff[ 9]
1294         imul    ecx, [edi - 40]                 ; ecx =  qlp_coeff[ 9] * data[i-10]
1295         add     ebp, ecx                        ; sum += qlp_coeff[ 9] * data[i-10]
1296         mov     ecx, [eax + 32]                 ; ecx =  qlp_coeff[ 8]
1297         imul    ecx, [edi - 36]                 ; ecx =  qlp_coeff[ 8] * data[i- 9]
1298         add     ebp, ecx                        ; sum += qlp_coeff[ 8] * data[i- 9]
1299         mov     ecx, [eax + 28]                 ; ecx =  qlp_coeff[ 7]
1300         imul    ecx, [edi - 32]                 ; ecx =  qlp_coeff[ 7] * data[i- 8]
1301         add     ebp, ecx                        ; sum += qlp_coeff[ 7] * data[i- 8]
1302         mov     ecx, [eax + 24]                 ; ecx =  qlp_coeff[ 6]
1303         imul    ecx, [edi - 28]                 ; ecx =  qlp_coeff[ 6] * data[i- 7]
1304         add     ebp, ecx                        ; sum += qlp_coeff[ 6] * data[i- 7]
1305         mov     ecx, [eax + 20]                 ; ecx =  qlp_coeff[ 5]
1306         imul    ecx, [edi - 24]                 ; ecx =  qlp_coeff[ 5] * data[i- 6]
1307         add     ebp, ecx                        ; sum += qlp_coeff[ 5] * data[i- 6]
1308         mov     ecx, [eax + 16]                 ; ecx =  qlp_coeff[ 4]
1309         imul    ecx, [edi - 20]                 ; ecx =  qlp_coeff[ 4] * data[i- 5]
1310         add     ebp, ecx                        ; sum += qlp_coeff[ 4] * data[i- 5]
1311         mov     ecx, [eax + 12]                 ; ecx =  qlp_coeff[ 3]
1312         imul    ecx, [edi - 16]                 ; ecx =  qlp_coeff[ 3] * data[i- 4]
1313         add     ebp, ecx                        ; sum += qlp_coeff[ 3] * data[i- 4]
1314         mov     ecx, [eax + 8]                  ; ecx =  qlp_coeff[ 2]
1315         imul    ecx, [edi - 12]                 ; ecx =  qlp_coeff[ 2] * data[i- 3]
1316         add     ebp, ecx                        ; sum += qlp_coeff[ 2] * data[i- 3]
1317         mov     ecx, [eax + 4]                  ; ecx =  qlp_coeff[ 1]
1318         imul    ecx, [edi - 8]                  ; ecx =  qlp_coeff[ 1] * data[i- 2]
1319         add     ebp, ecx                        ; sum += qlp_coeff[ 1] * data[i- 2]
1320         mov     ecx, [eax]                      ; ecx =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1321         imul    ecx, [edi - 4]                  ; ecx =  qlp_coeff[ 0] * data[i- 1]
1322         add     ebp, ecx                        ; sum += qlp_coeff[ 0] * data[i- 1]
1323 .jumper_0:
1324
1325         mov     ecx, [esp + 36]
1326         sar     ebp, cl                         ; ebp = (sum >> lp_quantization)
1327         add     ebp, [esi + edi]                ; ebp = residual[i] + (sum >> lp_quantization)
1328         mov     [edi], ebp                      ; data[i] = residual[i] + (sum >> lp_quantization)
1329         add     edi, byte 4
1330
1331         dec     ebx
1332         jz      short .end
1333         xor     ebp, ebp
1334         jmp     edx
1335
1336 .end:
1337         pop     edi
1338         pop     esi
1339         pop     ebx
1340         pop     ebp
1341         ret
1342
1343 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
1344 ; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
1345 ; cannot be used for side-channel coded 16bps channels since the effective bps
1346 ; is 17.
1347 ; WATCHOUT: this routine requires that each data array have a buffer of up to
1348 ; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
1349 ; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
1350         ALIGN   16
1351 cident FLAC__lpc_restore_signal_asm_ia32_mmx
             ; Rebuilds data[] from residual[] with MMX 16x16-bit multiply-adds:
             ;   data[i] = residual[i] + (sum_j(qlp_coeff[j] * data[i-j-1]) >> lp_quantization)
             ; Only the low 16 bits of each coefficient are used, and history samples are
             ; packed down to 16-bit words before being multiplied.
             ; The offsets listed below are relative to esp after the four pushes.
1352         ;[esp + 40]     data[]
1353         ;[esp + 36]     lp_quantization
1354         ;[esp + 32]     order
1355         ;[esp + 28]     qlp_coeff[]
1356         ;[esp + 24]     data_len
1357         ;[esp + 20]     residual[]
1358
1359         ;ASSERT(order > 0)
1360
1361         push    ebp                             ; preserve ebp/ebx/esi/edi; popped again at .end
1362         push    ebx
1363         push    esi
1364         push    edi
1365
1366         mov     esi, [esp + 20]                 ; esi = residual[]
1367         mov     edi, [esp + 40]                 ; edi = data[]
1368         mov     eax, [esp + 32]                 ; eax = order
1369         mov     ebx, [esp + 24]                 ; ebx = data_len
1370
1371         test    ebx, ebx
1372         jz      near .end                       ; do nothing if data_len == 0
1373         cmp     eax, byte 4                     ; order < 4 (unsigned) is not handled here:
1374         jb      near FLAC__lpc_restore_signal_asm_ia32.begin   ; continue at .begin of the non-MMX routine, registers still pushed
1375
1376         mov     edx, [esp + 28]                 ; edx = qlp_coeff[]
1377         movd    mm6, [esp + 36]                 ; mm6 = lp_quantization (shift count for psrad)
1378         mov     ebp, esp                        ; ebp = esp value to restore at .mmx_end
1379
1380         and     esp, 0xfffffff8                 ; round esp down to an 8-byte boundary
1381
1382         xor     ecx, ecx                        ; ecx = j = 0
1383 .copy_qlp_loop:
1384         push    word [edx + 4 * ecx]            ; push the low 16 bits of qlp_coeff[j]
1385         inc     ecx
1386         cmp     ecx, eax
1387         jnz     short .copy_qlp_loop            ; until j == order
1388
1389         and     ecx, 0x3                        ; ecx = order % 4
1390         test    ecx, ecx
1391         je      short .za_end                   ; order is a multiple of 4: no padding needed
1392         sub     ecx, byte 4                     ; ecx = -(number of zero words to push)
1393 .za_loop:
1394         push    word 0                          ; pad the packed coefficient array with a zero tap
1395         inc     eax                             ; eax ends as order rounded up to a multiple of 4
1396         inc     ecx
1397         jnz     short .za_loop
1398 .za_end:
1399
1400         movq    mm5, [esp + 2 * eax - 8]        ; mm5 = first four words pushed: {qlp_coeff[3], [2], [1], [0]} from low word to high word
1401         movd    mm4, [edi - 16]                 ; mm4 low dword  = data[-4]
1402         punpckldq       mm4, [edi - 12]         ; mm4 high dword = data[-3]
1403         movd    mm0, [edi - 8]                  ; mm0 low dword  = data[-2]
1404         punpckldq       mm0, [edi - 4]          ; mm0 high dword = data[-1]
1405         packssdw        mm4, mm0                ; mm4 = words {data[-4], data[-3], data[-2], data[-1]}, signed-saturated
1406
1407         cmp     eax, byte 4
1408         jnbe    short .mmx_4more                ; more than 4 (padded) taps: use the general loop
1409
1410         ALIGN   16
1411 .mmx_4_loop_i:
1412         movq    mm7, mm4
1413         pmaddwd mm7, mm5                        ; two partial sums of history words * coefficient words
1414         movq    mm0, mm7
1415         punpckhdq       mm7, mm7                ; copy the high partial sum into the low dword
1416         paddd   mm7, mm0                        ; low dword of mm7 = complete sum
1417         psrad   mm7, mm6                        ; sum >> lp_quantization (arithmetic shift)
1418         movd    mm1, [esi]                      ; mm1 = residual[i]
1419         paddd   mm7, mm1
1420         movd    [edi], mm7                      ; data[i] = residual[i] + (sum >> lp_quantization)
1421         psllq   mm7, 48                         ; move the low 16 bits of data[i] into the top word
1422         psrlq   mm4, 16                         ; drop the oldest sample from the history window
1423         por     mm4, mm7                        ; history window now ends with data[i]
1424
1425         add     esi, byte 4
1426         add     edi, byte 4
1427
1428         dec     ebx
1429         jnz     .mmx_4_loop_i                   ; repeat while samples remain
1430         jmp     .mmx_end
1431 .mmx_4more:
1432         shl     eax, 2
1433         neg     eax
1434         add     eax, byte 16                    ; eax = 16 - 4 * (padded order), a negative byte offset
1435         ALIGN   16
1436 .mmx_4more_loop_i:
1437         mov     ecx, edi
1438         add     ecx, eax                        ; ecx = &data[i] - 4 * (padded order - 4)
1439         mov     edx, esp                        ; edx = lowest address of the packed coefficients (highest taps / zero padding)
1440
1441         movq    mm7, mm4
1442         pmaddwd mm7, mm5                        ; start with the four most recent taps (mm4 * mm5)
1443
1444         ALIGN   16
1445 .mmx_4more_loop_j:
1446         movd    mm0, [ecx - 16]                 ; gather four consecutive older samples ...
1447         punpckldq       mm0, [ecx - 12]
1448         movd    mm1, [ecx - 8]
1449         punpckldq       mm1, [ecx - 4]
1450         packssdw        mm0, mm1                ; ... packed into four signed-saturated words
1451         pmaddwd mm0, [edx]                      ; multiply by the matching four packed coefficients
1452         paddd   mm7, mm0                        ; accumulate both partial sums
1453
1454         add     edx, byte 8
1455         add     ecx, byte 16
1456         cmp     ecx, edi
1457         jnz     .mmx_4more_loop_j               ; until ecx == &data[i]; the newest four taps are already in mm7
1458
1459         movq    mm0, mm7
1460         punpckhdq       mm7, mm7                ; copy the high partial sum into the low dword
1461         paddd   mm7, mm0                        ; low dword of mm7 = complete sum
1462         psrad   mm7, mm6                        ; sum >> lp_quantization (arithmetic shift)
1463         movd    mm1, [esi]                      ; mm1 = residual[i]
1464         paddd   mm7, mm1
1465         movd    [edi], mm7                      ; data[i] = residual[i] + (sum >> lp_quantization)
1466         psllq   mm7, 48                         ; move the low 16 bits of data[i] into the top word
1467         psrlq   mm4, 16                         ; drop the oldest sample from the history window
1468         por     mm4, mm7                        ; history window now ends with data[i]
1469
1470         add     esi, byte 4
1471         add     edi, byte 4
1472
1473         dec     ebx
1474         jnz     short .mmx_4more_loop_i
1475 .mmx_end:
1476         emms                                    ; clear MMX state before returning
1477         mov     esp, ebp                        ; discard the coefficient copy and undo the alignment
1478
1479 .end:
1480         pop     edi
1481         pop     esi
1482         pop     ebx
1483         pop     ebp
1484         ret
1485
1486
1487 ; **********************************************************************
1488 ;
1489 ;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
1490 ; {
1491 ;       unsigned i, j;
1492 ;       FLAC__int64 sum;
1493 ;
1494 ;       FLAC__ASSERT(order > 0);
1495 ;
1496 ;       for(i = 0; i < data_len; i++) {
1497 ;               sum = 0;
1498 ;               for(j = 0; j < order; j++)
1499 ;                       sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
1500 ;               residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
1501 ;       }
1502 ; }
1503         ALIGN   16
1504 cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
             ; Computes residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization), where
             ; sum is a 64-bit accumulation of qlp_coeff[j] * data[i-j-1] built with imul.
             ; order <= 1 runs a dedicated loop; larger orders jump into an unrolled chain
             ; of 32 taps, entering `order` taps before .jumper_0.
             ; The offsets listed below are relative to esp after the four pushes.
1505         ;[esp + 40]     residual[]
1506         ;[esp + 36]     lp_quantization
1507         ;[esp + 32]     order
1508         ;[esp + 28]     qlp_coeff[]
1509         ;[esp + 24]     data_len
1510         ;[esp + 20]     data[]
1511
1512         ;ASSERT(order > 0)
1513         ;ASSERT(order <= 32)
1514         ;ASSERT(lp_quantization <= 31)
1515
1516         push    ebp                             ; preserve ebp/ebx/esi/edi; popped again at .end
1517         push    ebx
1518         push    esi
1519         push    edi
1520
1521         mov     ebx, [esp + 24]                 ; ebx = data_len
1522         test    ebx, ebx
1523         jz      near .end                               ; do nothing if data_len == 0
1524
1525 .begin:
1526         mov     eax, [esp + 32]                 ; eax = order
1527         cmp     eax, 1
1528         jg      short .i_32                     ; order > 1 (signed compare): use the unrolled chain
1529
1530         mov     esi, [esp + 40]                 ; esi = residual[]
1531         mov     edi, [esp + 20]                 ; edi = data[]
1532         mov     ecx, [esp + 28]                 ; ecx = qlp_coeff[]
1533         mov     ebp, [ecx]                              ; ebp = qlp_coeff[0]
1534         mov     eax, [edi - 4]                  ; eax = data[-1]
1535         mov     ecx, [esp + 36]                 ; cl = lp_quantization
1536         ALIGN   16
1537 .i_1_loop_i:
1538         imul    ebp                                     ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
1539         shrd    eax, edx, cl            ; eax = low 32 bits of (edx:eax >> cl); 0 <= lp_quantization <= 15
1540         neg     eax                             ; eax = -(sum >> lp_quantization)
1541         add     eax, [edi]                      ; eax = data[i] - (sum >> lp_quantization)
1542         mov     [esi], eax                      ; residual[i] = eax
1543         mov     eax, [edi]                      ; eax = data[i], the multiplicand for the next iteration
1544         add     esi, 4
1545         add     edi, 4
1546         dec     ebx
1547         jnz     .i_1_loop_i
1548         jmp     .end
1549
1550 .mov_eip_to_eax:
1551         mov     eax, [esp]                      ; eax = return address, i.e. the address following the call
1552         ret
1553
1554 .i_32:  ; eax = order
1555         neg     eax                             ; eax = -order
1556         add     eax, eax                        ; eax = -2 * order
1557         lea     ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]   ; ebp = -10 * order + (.jumper_0 - .get_eip0)
1558         call    .mov_eip_to_eax                 ; eax = run-time address of .get_eip0
1559 .get_eip0:
1560         add     ebp, eax                        ; ebp = .jumper_0 - 10 * order
1561         inc     ebp                             ; compensate for the shorter opcode on the last iteration
1562
1563         mov     ebx, [esp + 28]                 ; ebx = qlp_coeff[] (data_len is counted in [esp + 24] from here on)
1564         mov     edi, [esp + 20]                 ; edi = data[]
1565         sub     [esp + 40], edi                 ; residual[] -= data[]
1566
1567         xor     ecx, ecx                        ; esi:ecx = sum = 0
1568         xor     esi, esi
1569         jmp     ebp                             ; enter the chain `order` taps before .jumper_0
1570
1571 ;eax = --
1572 ;edx = --
1573 ;ecx = 0
1574 ;esi = 0
1575 ;
1576 ;ebx = qlp_coeff[]
1577 ;edi = data[]
1578 ;ebp = @address
1579
             ; Unrolled taps, highest coefficient index first. Tap k multiplies
             ; qlp_coeff[k] at [ebx + 4*k] by data[i-k-1] at [edi - 4*(k+1)] and adds the
             ; 64-bit product to esi:ecx. The entry address computed above assumes every
             ; tap assembles to 10 bytes, except the final one (see the NOTE on it).
1580         mov     eax, [ebx + 124]                        ; eax =  qlp_coeff[31]
1581         imul    dword [edi - 128]               ; edx:eax =  qlp_coeff[31] * data[i-32]
1582         add     ecx, eax
1583         adc     esi, edx                                        ; sum += qlp_coeff[31] * data[i-32]
1584
1585         mov     eax, [ebx + 120]                        ; eax =  qlp_coeff[30]
1586         imul    dword [edi - 124]               ; edx:eax =  qlp_coeff[30] * data[i-31]
1587         add     ecx, eax
1588         adc     esi, edx                                        ; sum += qlp_coeff[30] * data[i-31]
1589
1590         mov     eax, [ebx + 116]
1591         imul    dword [edi - 120]
1592         add     ecx, eax
1593         adc     esi, edx
1594
1595         mov     eax, [ebx + 112]
1596         imul    dword [edi - 116]
1597         add     ecx, eax
1598         adc     esi, edx
1599
1600         mov     eax, [ebx + 108]
1601         imul    dword [edi - 112]
1602         add     ecx, eax
1603         adc     esi, edx
1604
1605         mov     eax, [ebx + 104]
1606         imul    dword [edi - 108]
1607         add     ecx, eax
1608         adc     esi, edx
1609
1610         mov     eax, [ebx + 100]
1611         imul    dword [edi - 104]
1612         add     ecx, eax
1613         adc     esi, edx
1614
1615         mov     eax, [ebx + 96]
1616         imul    dword [edi - 100]
1617         add     ecx, eax
1618         adc     esi, edx
1619
1620         mov     eax, [ebx + 92]
1621         imul    dword [edi - 96]
1622         add     ecx, eax
1623         adc     esi, edx
1624
1625         mov     eax, [ebx + 88]
1626         imul    dword [edi - 92]
1627         add     ecx, eax
1628         adc     esi, edx
1629
1630         mov     eax, [ebx + 84]
1631         imul    dword [edi - 88]
1632         add     ecx, eax
1633         adc     esi, edx
1634
1635         mov     eax, [ebx + 80]
1636         imul    dword [edi - 84]
1637         add     ecx, eax
1638         adc     esi, edx
1639
1640         mov     eax, [ebx + 76]
1641         imul    dword [edi - 80]
1642         add     ecx, eax
1643         adc     esi, edx
1644
1645         mov     eax, [ebx + 72]
1646         imul    dword [edi - 76]
1647         add     ecx, eax
1648         adc     esi, edx
1649
1650         mov     eax, [ebx + 68]
1651         imul    dword [edi - 72]
1652         add     ecx, eax
1653         adc     esi, edx
1654
1655         mov     eax, [ebx + 64]
1656         imul    dword [edi - 68]
1657         add     ecx, eax
1658         adc     esi, edx
1659
1660         mov     eax, [ebx + 60]
1661         imul    dword [edi - 64]
1662         add     ecx, eax
1663         adc     esi, edx
1664
1665         mov     eax, [ebx + 56]
1666         imul    dword [edi - 60]
1667         add     ecx, eax
1668         adc     esi, edx
1669
1670         mov     eax, [ebx + 52]
1671         imul    dword [edi - 56]
1672         add     ecx, eax
1673         adc     esi, edx
1674
1675         mov     eax, [ebx + 48]
1676         imul    dword [edi - 52]
1677         add     ecx, eax
1678         adc     esi, edx
1679
1680         mov     eax, [ebx + 44]
1681         imul    dword [edi - 48]
1682         add     ecx, eax
1683         adc     esi, edx
1684
1685         mov     eax, [ebx + 40]
1686         imul    dword [edi - 44]
1687         add     ecx, eax
1688         adc     esi, edx
1689
1690         mov     eax, [ebx + 36]
1691         imul    dword [edi - 40]
1692         add     ecx, eax
1693         adc     esi, edx
1694
1695         mov     eax, [ebx + 32]
1696         imul    dword [edi - 36]
1697         add     ecx, eax
1698         adc     esi, edx
1699
1700         mov     eax, [ebx + 28]
1701         imul    dword [edi - 32]
1702         add     ecx, eax
1703         adc     esi, edx
1704
1705         mov     eax, [ebx + 24]
1706         imul    dword [edi - 28]
1707         add     ecx, eax
1708         adc     esi, edx
1709
1710         mov     eax, [ebx + 20]
1711         imul    dword [edi - 24]
1712         add     ecx, eax
1713         adc     esi, edx
1714
1715         mov     eax, [ebx + 16]
1716         imul    dword [edi - 20]
1717         add     ecx, eax
1718         adc     esi, edx
1719
1720         mov     eax, [ebx + 12]
1721         imul    dword [edi - 16]
1722         add     ecx, eax
1723         adc     esi, edx
1724
1725         mov     eax, [ebx + 8]
1726         imul    dword [edi - 12]
1727         add     ecx, eax
1728         adc     esi, edx
1729
1730         mov     eax, [ebx + 4]
1731         imul    dword [edi - 8]
1732         add     ecx, eax
1733         adc     esi, edx
1734
1735         mov     eax, [ebx]                                      ; eax =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1736         imul    dword [edi - 4]                 ; edx:eax =  qlp_coeff[ 0] * data[i- 1]
1737         add     ecx, eax
1738         adc     esi, edx                                        ; sum += qlp_coeff[ 0] * data[i- 1]
1739
1740 .jumper_0:
1741         mov     edx, ecx                        ; edx = low dword of sum (high dword stays in esi)
1742 ;esi:edx = sum
1743         mov     ecx, [esp + 36]                 ; cl = lp_quantization
1744         shrd    edx, esi, cl            ; edx = (sum >> lp_quantization)
1745 ;eax = --
1746 ;ecx = --
1747 ;edx = sum >> lp_q
1748 ;esi = --
1749         neg     edx                                             ; edx = -(sum >> lp_quantization)
1750         mov     eax, [esp + 40]                 ; residual[] - data[]
1751         add     edx, [edi]                              ; edx = data[i] - (sum >> lp_quantization)
1752         mov     [edi + eax], edx                ; residual[i] = edx (edi + eax addresses residual[i])
1753         add     edi, 4
1754
1755         dec     dword [esp + 24]                ; one sample done; data_len is counted down in its stack slot
1756         jz      short .end
1757         xor     ecx, ecx                        ; sum = 0 for the next sample
1758         xor     esi, esi
1759         jmp     ebp                             ; re-enter the unrolled chain at the same tap
1760
1761 .end:
1762         pop     edi
1763         pop     esi
1764         pop     ebx
1765         pop     ebp
1766         ret
1767
1768 ; **********************************************************************
1769 ;
1770 ; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1771 ; {
1772 ;       unsigned i, j;
1773 ;       FLAC__int64 sum;
1774 ;
1775 ;       FLAC__ASSERT(order > 0);
1776 ;
1777 ;       for(i = 0; i < data_len; i++) {
1778 ;               sum = 0;
1779 ;               for(j = 0; j < order; j++)
1780 ;                       sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
1781 ;               data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
1782 ;       }
1783 ; }
1784         ALIGN   16
1785 cident FLAC__lpc_restore_signal_wide_asm_ia32
             ; Computes data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization), where
             ; sum is a 64-bit accumulation of qlp_coeff[j] * data[i-j-1] built with imul.
             ; order <= 1 runs a dedicated loop; larger orders jump into an unrolled chain
             ; of 32 taps, entering `order` taps before .jumper_0.
             ; The .x87_* label names notwithstanding, only integer instructions are used.
             ; The offsets listed below are relative to esp after the four pushes.
1786         ;[esp + 40]     data[]
1787         ;[esp + 36]     lp_quantization
1788         ;[esp + 32]     order
1789         ;[esp + 28]     qlp_coeff[]
1790         ;[esp + 24]     data_len
1791         ;[esp + 20]     residual[]
1792
1793         ;ASSERT(order > 0)
1794         ;ASSERT(order <= 32)
1795         ;ASSERT(lp_quantization <= 31)
1796
1797         push    ebp                             ; preserve ebp/ebx/esi/edi; popped again at .end
1798         push    ebx
1799         push    esi
1800         push    edi
1801
1802         mov     ebx, [esp + 24]                 ; ebx = data_len
1803         test    ebx, ebx
1804         jz      near .end                               ; do nothing if data_len == 0
1805
1806 .begin:
1807         mov     eax, [esp + 32]                 ; eax = order
1808         cmp     eax, 1
1809         jg      short .x87_32                   ; order > 1 (signed compare): use the unrolled chain
1810
1811         mov     esi, [esp + 20]                 ; esi = residual[]
1812         mov     edi, [esp + 40]                 ; edi = data[]
1813         mov     ecx, [esp + 28]                 ; ecx = qlp_coeff[]
1814         mov     ebp, [ecx]                              ; ebp = qlp_coeff[0]
1815         mov     eax, [edi - 4]                  ; eax = data[-1]
1816         mov     ecx, [esp + 36]                 ; cl = lp_quantization
1817         ALIGN   16
1818 .x87_1_loop_i:
1819         imul    ebp                                     ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
1820         shrd    eax, edx, cl            ; eax = low 32 bits of (edx:eax >> cl); 0 <= lp_quantization <= 15
1821 ;
1822         add     eax, [esi]                      ; eax = residual[i] + (sum >> lp_quantization)
1823         mov     [edi], eax                      ; data[i] = eax; eax is also the multiplicand for the next iteration
1824 ;
1825         add     esi, 4
1826         add     edi, 4
1827         dec     ebx
1828         jnz     .x87_1_loop_i
1829         jmp     .end
1830
1831 .mov_eip_to_eax:
1832         mov     eax, [esp]                      ; eax = return address, i.e. the address following the call
1833         ret
1834
1835 .x87_32:        ; eax = order
1836         neg     eax                             ; eax = -order
1837         add     eax, eax                        ; eax = -2 * order
1838         lea     ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]   ; ebp = -10 * order + (.jumper_0 - .get_eip0)
1839         call    .mov_eip_to_eax                 ; eax = run-time address of .get_eip0
1840 .get_eip0:
1841         add     ebp, eax                        ; ebp = .jumper_0 - 10 * order
1842         inc     ebp                             ; compensate for the shorter opcode on the last iteration
1843
1844         mov     ebx, [esp + 28]                 ; ebx = qlp_coeff[] (data_len is counted in [esp + 24] from here on)
1845         mov     edi, [esp + 40]                 ; edi = data[]
1846         sub     [esp + 20], edi                 ; residual[] -= data[]
1847
1848         xor     ecx, ecx                        ; esi:ecx = sum = 0
1849         xor     esi, esi
1850         jmp     ebp                             ; enter the chain `order` taps before .jumper_0
1851
1852 ;eax = --
1853 ;edx = --
1854 ;ecx = 0
1855 ;esi = 0
1856 ;
1857 ;ebx = qlp_coeff[]
1858 ;edi = data[]
1859 ;ebp = @address
1860
             ; Unrolled taps, highest coefficient index first. Tap k multiplies
             ; qlp_coeff[k] at [ebx + 4*k] by data[i-k-1] at [edi - 4*(k+1)] and adds the
             ; 64-bit product to esi:ecx. The entry address computed above assumes every
             ; tap assembles to 10 bytes, except the final one (see the NOTE on it).
1861         mov     eax, [ebx + 124]                        ; eax =  qlp_coeff[31]
1862         imul    dword [edi - 128]               ; edx:eax =  qlp_coeff[31] * data[i-32]
1863         add     ecx, eax
1864         adc     esi, edx                                        ; sum += qlp_coeff[31] * data[i-32]
1865
1866         mov     eax, [ebx + 120]                        ; eax =  qlp_coeff[30]
1867         imul    dword [edi - 124]               ; edx:eax =  qlp_coeff[30] * data[i-31]
1868         add     ecx, eax
1869         adc     esi, edx                                        ; sum += qlp_coeff[30] * data[i-31]
1870
1871         mov     eax, [ebx + 116]
1872         imul    dword [edi - 120]
1873         add     ecx, eax
1874         adc     esi, edx
1875
1876         mov     eax, [ebx + 112]
1877         imul    dword [edi - 116]
1878         add     ecx, eax
1879         adc     esi, edx
1880
1881         mov     eax, [ebx + 108]
1882         imul    dword [edi - 112]
1883         add     ecx, eax
1884         adc     esi, edx
1885
1886         mov     eax, [ebx + 104]
1887         imul    dword [edi - 108]
1888         add     ecx, eax
1889         adc     esi, edx
1890
1891         mov     eax, [ebx + 100]
1892         imul    dword [edi - 104]
1893         add     ecx, eax
1894         adc     esi, edx
1895
1896         mov     eax, [ebx + 96]
1897         imul    dword [edi - 100]
1898         add     ecx, eax
1899         adc     esi, edx
1900
1901         mov     eax, [ebx + 92]
1902         imul    dword [edi - 96]
1903         add     ecx, eax
1904         adc     esi, edx
1905
1906         mov     eax, [ebx + 88]
1907         imul    dword [edi - 92]
1908         add     ecx, eax
1909         adc     esi, edx
1910
1911         mov     eax, [ebx + 84]
1912         imul    dword [edi - 88]
1913         add     ecx, eax
1914         adc     esi, edx
1915
1916         mov     eax, [ebx + 80]
1917         imul    dword [edi - 84]
1918         add     ecx, eax
1919         adc     esi, edx
1920
1921         mov     eax, [ebx + 76]
1922         imul    dword [edi - 80]
1923         add     ecx, eax
1924         adc     esi, edx
1925
1926         mov     eax, [ebx + 72]
1927         imul    dword [edi - 76]
1928         add     ecx, eax
1929         adc     esi, edx
1930
1931         mov     eax, [ebx + 68]
1932         imul    dword [edi - 72]
1933         add     ecx, eax
1934         adc     esi, edx
1935
1936         mov     eax, [ebx + 64]
1937         imul    dword [edi - 68]
1938         add     ecx, eax
1939         adc     esi, edx
1940
1941         mov     eax, [ebx + 60]
1942         imul    dword [edi - 64]
1943         add     ecx, eax
1944         adc     esi, edx
1945
1946         mov     eax, [ebx + 56]
1947         imul    dword [edi - 60]
1948         add     ecx, eax
1949         adc     esi, edx
1950
1951         mov     eax, [ebx + 52]
1952         imul    dword [edi - 56]
1953         add     ecx, eax
1954         adc     esi, edx
1955
1956         mov     eax, [ebx + 48]
1957         imul    dword [edi - 52]
1958         add     ecx, eax
1959         adc     esi, edx
1960
1961         mov     eax, [ebx + 44]
1962         imul    dword [edi - 48]
1963         add     ecx, eax
1964         adc     esi, edx
1965
1966         mov     eax, [ebx + 40]
1967         imul    dword [edi - 44]
1968         add     ecx, eax
1969         adc     esi, edx
1970
1971         mov     eax, [ebx + 36]
1972         imul    dword [edi - 40]
1973         add     ecx, eax
1974         adc     esi, edx
1975
1976         mov     eax, [ebx + 32]
1977         imul    dword [edi - 36]
1978         add     ecx, eax
1979         adc     esi, edx
1980
1981         mov     eax, [ebx + 28]
1982         imul    dword [edi - 32]
1983         add     ecx, eax
1984         adc     esi, edx
1985
1986         mov     eax, [ebx + 24]
1987         imul    dword [edi - 28]
1988         add     ecx, eax
1989         adc     esi, edx
1990
1991         mov     eax, [ebx + 20]
1992         imul    dword [edi - 24]
1993         add     ecx, eax
1994         adc     esi, edx
1995
1996         mov     eax, [ebx + 16]
1997         imul    dword [edi - 20]
1998         add     ecx, eax
1999         adc     esi, edx
2000
2001         mov     eax, [ebx + 12]
2002         imul    dword [edi - 16]
2003         add     ecx, eax
2004         adc     esi, edx
2005
2006         mov     eax, [ebx + 8]
2007         imul    dword [edi - 12]
2008         add     ecx, eax
2009         adc     esi, edx
2010
2011         mov     eax, [ebx + 4]
2012         imul    dword [edi - 8]
2013         add     ecx, eax
2014         adc     esi, edx
2015
2016         mov     eax, [ebx]                                      ; eax =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
2017         imul    dword [edi - 4]                 ; edx:eax =  qlp_coeff[ 0] * data[i- 1]
2018         add     ecx, eax
2019         adc     esi, edx                                        ; sum += qlp_coeff[ 0] * data[i- 1]
2020
2021 .jumper_0:
2022         mov     edx, ecx                        ; edx = low dword of sum (high dword stays in esi)
2023 ;esi:edx = sum
2024         mov     ecx, [esp + 36]                 ; cl = lp_quantization
2025         shrd    edx, esi, cl            ; edx = (sum >> lp_quantization)
2026 ;eax = --
2027 ;ecx = --
2028 ;edx = sum >> lp_q
2029 ;esi = --
2030 ;
2031         mov     eax, [esp + 20]                 ; residual[] - data[]
2032         add     edx, [edi + eax]                ; edx = residual[i] + (sum >> lp_quantization)
2033         mov     [edi], edx                              ; data[i] = residual[i] + (sum >> lp_quantization)
2034         add     edi, 4
2035
2036         dec     dword [esp + 24]                ; one sample done; data_len is counted down in its stack slot
2037         jz      short .end
2038         xor     ecx, ecx                        ; sum = 0 for the next sample
2039         xor     esi, esi
2040         jmp     ebp                             ; re-enter the unrolled chain at the same tap
2041
2042 .end:
2043         pop     edi
2044         pop     esi
2045         pop     ebx
2046         pop     ebp
2047         ret
2048
2049 ; end