| 1 | ; vim:filetype=nasm ts=8 |
| 2 | |
| 3 | ; libFLAC - Free Lossless Audio Codec library |
| 4 | ; Copyright (C) 2001-2009 Josh Coalson |
| 5 | ; Copyright (C) 2011-2016 Xiph.Org Foundation |
| 6 | ; |
| 7 | ; Redistribution and use in source and binary forms, with or without |
| 8 | ; modification, are permitted provided that the following conditions |
| 9 | ; are met: |
| 10 | ; |
| 11 | ; - Redistributions of source code must retain the above copyright |
| 12 | ; notice, this list of conditions and the following disclaimer. |
| 13 | ; |
| 14 | ; - Redistributions in binary form must reproduce the above copyright |
| 15 | ; notice, this list of conditions and the following disclaimer in the |
| 16 | ; documentation and/or other materials provided with the distribution. |
| 17 | ; |
| 18 | ; - Neither the name of the Xiph.org Foundation nor the names of its |
| 19 | ; contributors may be used to endorse or promote products derived from |
| 20 | ; this software without specific prior written permission. |
| 21 | ; |
| 22 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 23 | ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 24 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 25 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
| 26 | ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 27 | ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 28 | ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 29 | ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 30 | ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 31 | ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 32 | ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 33 | |
| 34 | %include "nasm.h" |
| 35 | |
| 36 | data_section | ; macro from nasm.h (definition not visible here) -- presumably selects the data section; nothing is placed in it before code_section below
| 37 | |
| 38 | cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov | ; macro from nasm.h -- presumably declares the routine's symbol as global so C code can link to it; confirm in nasm.h
| 39 | |
| 40 | code_section | ; macro from nasm.h -- presumably selects the code section that receives the routine defined below
| 41 | |
| 42 | ; ********************************************************************** |
| 43 | ; |
| 44 | ; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 *data, unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]) |
| 45 | ; { |
| 46 | ; FLAC__int32 last_error_0 = data[-1]; |
| 47 | ; FLAC__int32 last_error_1 = data[-1] - data[-2]; |
| 48 | ; FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]); |
| 49 | ; FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]); |
| 50 | ; FLAC__int32 error, save; |
| 51 | ; FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0; |
| 52 | ; unsigned i, order; |
| 53 | ; |
| 54 | ; for(i = 0; i < data_len; i++) { |
| 55 | ; error = data[i] ; total_error_0 += local_abs(error); save = error; |
| 56 | ; error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error; |
| 57 | ; error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error; |
| 58 | ; error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error; |
| 59 | ; error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save; |
| 60 | ; } |
| 61 | ; |
| 62 | ; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4)) |
| 63 | ; order = 0; |
| 64 | ; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4)) |
| 65 | ; order = 1; |
| 66 | ; else if(total_error_2 < min(total_error_3, total_error_4)) |
| 67 | ; order = 2; |
| 68 | ; else if(total_error_3 < total_error_4) |
| 69 | ; order = 3; |
| 70 | ; else |
| 71 | ; order = 4; |
| 72 | ; |
| 73 | ; residual_bits_per_sample[0] = (float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0); |
| 74 | ; residual_bits_per_sample[1] = (float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0); |
| 75 | ; residual_bits_per_sample[2] = (float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0); |
| 76 | ; residual_bits_per_sample[3] = (float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0); |
| 77 | ; residual_bits_per_sample[4] = (float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0); |
| 78 | ; |
| 79 | ; return order; |
| 80 | ; } |
;-----------------------------------------------------------------------
; FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
;
; For fixed-predictor orders 0..4, accumulates sum(|residual|) over
; data[0 .. data_len-1] (32-bit wrapping sums), stores one float
; "bits per sample" estimate per order, and returns the order whose sum
; is smallest (on a tie the higher order wins).
;
; Target: 32-bit x86; uses MMX, CMOVcc and x87 instructions.
; In (on the stack; offsets are valid once the prologue has run):
;     [esp + FCBP_ARG_DATA]  data      dwords; data[-4..-1] are read as history
;     [esp + FCBP_ARG_LEN]   data_len  sample count
;     [esp + FCBP_ARG_RBPS]  residual_bits_per_sample, 5 floats are written
; Out:   eax = selected order (4 when data_len == 0)
; Saved: ebp, ebx, esi, edi (pushed and popped here)
; Clobb: ecx, edx, flags, mm0-mm7 / x87 register contents
;-----------------------------------------------------------------------

%define FCBP_LOCALS     16              ; scratch bytes reserved under the saved registers
%define FCBP_ARG_DATA   36              ; = 4 pushes + FCBP_LOCALS + return address
%define FCBP_ARG_LEN    40
%define FCBP_ARG_RBPS   44

;-----------------------------------------------------------------------
; fcbp_store_rbps total_reg, byte_offset
;
; Writes residual_bits_per_sample[byte_offset / 4]:
;     total_reg != 0  ->  log2(ln2 * total_reg / data_len)
;     total_reg == 0  ->  0.0
; Expects: eax = 0, ebx = residual_bits_per_sample, ST0 = data_len,
;          qword [esp] free for staging.
; Leaves:  ST0 = data_len (x87 stack depth unchanged); flags clobbered.
;-----------------------------------------------------------------------
%macro fcbp_store_rbps 2
        test    %1, %1
        jz      %%is_zero
        fld1                                    ; ST: 1.0, data_len
        mov     [esp], %1                       ; low dword  = total
        mov     [esp + 4], eax                  ; high dword = 0, so the qword is the unsigned total
        fild    qword [esp]                     ; ST: total, 1.0, data_len
        fdiv    st2                             ; ST: total/data_len, 1.0, data_len
        fldln2                                  ; ST: ln2, total/data_len, 1.0, data_len
        fmulp   st1                             ; ST: ln2*total/data_len, 1.0, data_len
        fyl2x                                   ; ST: 1.0*log2(ln2*total/data_len), data_len
        fstp    dword [ebx + %2]                ; store as float; ST: data_len
        jmp     short %%done
%%is_zero:
        mov     [ebx + %2], eax                 ; all-zero bits == 0.0f
%%done:
%endmacro

        ALIGN 16
cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov

        ; ---- prologue -------------------------------------------------
        push    ebp
        push    ebx
        push    esi
        push    edi
        sub     esp, byte FCBP_LOCALS           ; qword [esp] stages 64-bit integers for the FPU

        ; Register roles while summing (MMX pairs written high:low):
        ;   ebx = address of the next sample     ecx = samples left
        ;   mm0 = total_error_1 : total_error_0
        ;   mm1 = total_error_2 : total_error_3
        ;   mm2 =       (junk)  : total_error_4
        ;   mm3 = last_error_1  : last_error_0
        ;   mm4 = last_error_2  : last_error_3
        ;   mm5, mm6, mm7 = temporaries

        mov     ecx, [esp + FCBP_ARG_LEN]       ; ecx = data_len
        test    ecx, ecx
        jz      near .empty_input               ; nothing to sum: take the all-zero path

        ; ---- seed the difference history from data[-1..-4] -------------
        mov     ebx, [esp + FCBP_ARG_DATA]      ; ebx -> data[0]
        movd    mm3, [ebx - 4]                  ; mm3 = 0 : data[-1]             (last_error_0)
        movd    mm2, [ebx - 8]                  ; mm2 = 0 : data[-2]
        movd    mm1, [ebx - 12]                 ; mm1 = 0 : data[-3]
        movd    mm0, [ebx - 16]                 ; mm0 = 0 : data[-4]
        movq    mm5, mm3
        psubd   mm5, mm2                        ; mm5 = 0 : data[-1] - data[-2]  (last_error_1)
        punpckldq mm3, mm5                      ; mm3 = last_error_1 : last_error_0
        psubd   mm2, mm1                        ; mm2 = 0 : data[-2] - data[-3]
        psubd   mm5, mm2                        ; mm5 = 0 : last_error_2
        movq    mm4, mm5
        psubd   mm4, mm2                        ;   last_error_2 - (data[-2] - data[-3])
        paddd   mm4, mm1                        ;   ... + data[-3]
        psubd   mm4, mm0                        ; mm4 = 0 : last_error_3   (... - data[-4])
        punpckldq mm4, mm5                      ; mm4 = last_error_2 : last_error_3
        pxor    mm0, mm0                        ; clear all five running totals
        pxor    mm1, mm1
        pxor    mm2, mm2

        ; ---- per-sample accumulation ----------------------------------
        ALIGN 16
.next_sample:
        movd    mm7, [ebx]                      ; mm7 = 0 : error_0 (the sample itself)
        add     ebx, byte 4
        movq    mm6, mm7
        psubd   mm7, mm3                        ; low lane = error_0 - last_error_0 = error_1
        punpckldq mm6, mm7                      ; mm6 = error_1 : error_0
        movq    mm5, mm6
        movq    mm7, mm6
        psubd   mm5, mm3                        ; high lane = error_1 - last_error_1 = error_2
        movq    mm3, mm6                        ; history <- error_1 : error_0
        psrad   mm6, 31                         ; per-lane sign mask (0 or -1)
        pxor    mm7, mm6
        psubd   mm7, mm6                        ; (x ^ m) - m  ->  |error_1| : |error_0|
        paddd   mm0, mm7                        ; total_error_1 : total_error_0 += those
        movq    mm6, mm5                        ; keep error_2 in the high lane
        psubd   mm5, mm4                        ; high lane = error_2 - last_error_2 = error_3
        punpckhdq mm5, mm6                      ; mm5 = error_2 : error_3
        movq    mm7, mm5
        movq    mm6, mm5
        psubd   mm5, mm4                        ; low lane = error_3 - last_error_3 = error_4
        movq    mm4, mm6                        ; history <- error_2 : error_3
        psrad   mm6, 31
        pxor    mm7, mm6
        psubd   mm7, mm6                        ; |error_2| : |error_3|
        paddd   mm1, mm7                        ; total_error_2 : total_error_3 += those
        movq    mm6, mm5
        psrad   mm5, 31
        pxor    mm6, mm5
        psubd   mm6, mm5                        ; low lane = |error_4|
        paddd   mm2, mm6                        ; total_error_4 += |error_4| (high lane unused)

        dec     ecx
        jnz     short .next_sample

        ; ---- unpack the five totals into integer registers -------------
        movq    mm3, mm0
        movd    edi, mm2                        ; edi = total_error_4
        movd    esi, mm1                        ; esi = total_error_3
        movd    eax, mm0                        ; eax = total_error_0
        punpckhdq mm1, mm1                      ; bring the high lanes down
        punpckhdq mm3, mm3
        movd    edx, mm1                        ; edx = total_error_2
        movd    ecx, mm3                        ; ecx = total_error_1

        ; ---- choose the order with the smallest total -------------------
        ; eax = running minimum, ebx = candidate order, ebp = best order.
        ; CMOVcc leaves the flags alone, so each pair below shares one CMP.
        ; The order update uses "below or equal": a tie selects the higher
        ; order, matching the strict '<' tests of the C reference.
        xor     ebx, ebx
        xor     ebp, ebp                        ; best = 0
        inc     ebx                             ; candidate 1
        cmp     ecx, eax
        cmovb   eax, ecx
        cmovbe  ebp, ebx
        inc     ebx                             ; candidate 2
        cmp     edx, eax
        cmovb   eax, edx
        cmovbe  ebp, ebx
        inc     ebx                             ; candidate 3
        cmp     esi, eax
        cmovb   eax, esi
        cmovbe  ebp, ebx
        inc     ebx                             ; candidate 4
        cmp     edi, eax
        cmovb   eax, edi
        cmovbe  ebp, ebx
        movd    ebx, mm0                        ; ebx = total_error_0 (eax now holds the minimum)
        emms                                    ; done with MMX; x87 is used from here on

        ; ---- residual_bits_per_sample[0..4] ------------------------------
        xor     eax, eax                        ; eax = 0: high dword for staging, and the bits of 0.0f
        fild    dword [esp + FCBP_ARG_LEN]      ; ST0 = data_len, read as a signed dword (assumes < 2^31)

        ; Order 0 is written out by hand: ebx carries total_error_0 on entry
        ; and is reloaded with the output pointer only after being staged.
        test    ebx, ebx
        jz      .order0_zero
        fld1                                    ; ST: 1.0, data_len
        mov     [esp], ebx
        mov     [esp + 4], eax                  ; qword [esp] = total_error_0, zero-extended
        mov     ebx, [esp + FCBP_ARG_RBPS]      ; ebx = residual_bits_per_sample from here on
        fild    qword [esp]                     ; ST: total, 1.0, data_len
        fdiv    st2                             ; ST: total/data_len, 1.0, data_len
        fldln2
        fmulp   st1                             ; ST: ln2*total/data_len, 1.0, data_len
        fyl2x                                   ; ST: log2(ln2*total/data_len), data_len
        fstp    dword [ebx]                     ; residual_bits_per_sample[0]
        jmp     short .orders_1_to_4
.order0_zero:
        mov     ebx, [esp + FCBP_ARG_RBPS]
        mov     [ebx], eax                      ; residual_bits_per_sample[0] = 0.0
.orders_1_to_4:
        fcbp_store_rbps ecx, 4                  ; order 1
        fcbp_store_rbps edx, 8                  ; order 2
        fcbp_store_rbps esi, 12                 ; order 3
        fcbp_store_rbps edi, 16                 ; order 4

        fstp    st0                             ; drop data_len; x87 stack is balanced again
        jmp     short .done

        ; ---- data_len == 0: every estimate is 0.0, order is 4 ------------
.empty_input:
        xor     ebp, ebp
        mov     edi, [esp + FCBP_ARG_RBPS]
        mov     [edi], ebp
        mov     [edi + 4], ebp
        mov     [edi + 8], ebp
        mov     [edi + 12], ebp
        mov     [edi + 16], ebp
        add     ebp, byte 4                     ; order = 4

        ; ---- epilogue ----------------------------------------------------
.done:
        mov     eax, ebp                        ; return value = order
        add     esp, byte FCBP_LOCALS
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret
| 308 | |
| 309 | ; end |