ce188d4d |
1 | /* libFLAC - Free Lossless Audio Codec library |
2 | * Copyright (C) 2000-2009 Josh Coalson |
3 | * Copyright (C) 2011-2016 Xiph.Org Foundation |
4 | * |
5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions |
7 | * are met: |
8 | * |
9 | * - Redistributions of source code must retain the above copyright |
10 | * notice, this list of conditions and the following disclaimer. |
11 | * |
12 | * - Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. |
15 | * |
16 | * - Neither the name of the Xiph.org Foundation nor the names of its |
17 | * contributors may be used to endorse or promote products derived from |
18 | * this software without specific prior written permission. |
19 | * |
20 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
21 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
22 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
23 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
24 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
25 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
26 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
27 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
28 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
29 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
30 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
31 | */ |
32 | |
33 | #ifdef HAVE_CONFIG_H |
34 | # include <config.h> |
35 | #endif |
36 | |
37 | #include "private/cpu.h" |
38 | |
39 | #ifndef FLAC__INTEGER_ONLY_LIBRARY |
40 | #ifndef FLAC__NO_ASM |
41 | #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN |
42 | #include "private/lpc.h" |
43 | #ifdef FLAC__SSE_SUPPORTED |
44 | #include "FLAC/assert.h" |
45 | #include "FLAC/format.h" |
46 | |
47 | #include <xmmintrin.h> /* SSE */ |
48 | |
/* new routines: more unaligned loads, fewer shuffles
 * old routines: fewer unaligned loads, more shuffles
 * these *_old routines are equivalent to the ASM routines in ia32/lpc_asm.nasm
 */
53 | |
54 | /* new routines: faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */ |
55 | |
56 | FLAC__SSE_TARGET("sse") |
57 | void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) |
58 | { |
59 | int i; |
60 | int limit = data_len - 4; |
61 | __m128 sum0; |
62 | |
63 | (void) lag; |
64 | FLAC__ASSERT(lag <= 4); |
65 | FLAC__ASSERT(lag <= data_len); |
66 | |
67 | sum0 = _mm_setzero_ps(); |
68 | |
69 | for(i = 0; i <= limit; i++) { |
70 | __m128 d, d0; |
71 | d0 = _mm_loadu_ps(data+i); |
72 | d = d0; d = _mm_shuffle_ps(d, d, 0); |
73 | sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d)); |
74 | } |
75 | |
76 | { |
77 | __m128 d0 = _mm_setzero_ps(); |
78 | limit++; if(limit < 0) limit = 0; |
79 | |
80 | for(i = data_len-1; i >= limit; i--) { |
81 | __m128 d; |
82 | d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0); |
83 | d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3)); |
84 | d0 = _mm_move_ss(d0, d); |
85 | sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0)); |
86 | } |
87 | } |
88 | |
89 | _mm_storeu_ps(autoc, sum0); |
90 | } |
91 | |
92 | FLAC__SSE_TARGET("sse") |
93 | void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) |
94 | { |
95 | int i; |
96 | int limit = data_len - 8; |
97 | __m128 sum0, sum1; |
98 | |
99 | (void) lag; |
100 | FLAC__ASSERT(lag <= 8); |
101 | FLAC__ASSERT(lag <= data_len); |
102 | |
103 | sum0 = _mm_setzero_ps(); |
104 | sum1 = _mm_setzero_ps(); |
105 | |
106 | for(i = 0; i <= limit; i++) { |
107 | __m128 d, d0, d1; |
108 | d0 = _mm_loadu_ps(data+i); |
109 | d1 = _mm_loadu_ps(data+i+4); |
110 | d = d0; d = _mm_shuffle_ps(d, d, 0); |
111 | sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d)); |
112 | sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d)); |
113 | } |
114 | |
115 | { |
116 | __m128 d0 = _mm_setzero_ps(); |
117 | __m128 d1 = _mm_setzero_ps(); |
118 | limit++; if(limit < 0) limit = 0; |
119 | |
120 | for(i = data_len-1; i >= limit; i--) { |
121 | __m128 d; |
122 | d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0); |
123 | d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3)); |
124 | d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3)); |
125 | d1 = _mm_move_ss(d1, d0); |
126 | d0 = _mm_move_ss(d0, d); |
127 | sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1)); |
128 | sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0)); |
129 | } |
130 | } |
131 | |
132 | _mm_storeu_ps(autoc, sum0); |
133 | _mm_storeu_ps(autoc+4, sum1); |
134 | } |
135 | |
136 | FLAC__SSE_TARGET("sse") |
137 | void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) |
138 | { |
139 | int i; |
140 | int limit = data_len - 12; |
141 | __m128 sum0, sum1, sum2; |
142 | |
143 | (void) lag; |
144 | FLAC__ASSERT(lag <= 12); |
145 | FLAC__ASSERT(lag <= data_len); |
146 | |
147 | sum0 = _mm_setzero_ps(); |
148 | sum1 = _mm_setzero_ps(); |
149 | sum2 = _mm_setzero_ps(); |
150 | |
151 | for(i = 0; i <= limit; i++) { |
152 | __m128 d, d0, d1, d2; |
153 | d0 = _mm_loadu_ps(data+i); |
154 | d1 = _mm_loadu_ps(data+i+4); |
155 | d2 = _mm_loadu_ps(data+i+8); |
156 | d = d0; d = _mm_shuffle_ps(d, d, 0); |
157 | sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d)); |
158 | sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d)); |
159 | sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d)); |
160 | } |
161 | |
162 | { |
163 | __m128 d0 = _mm_setzero_ps(); |
164 | __m128 d1 = _mm_setzero_ps(); |
165 | __m128 d2 = _mm_setzero_ps(); |
166 | limit++; if(limit < 0) limit = 0; |
167 | |
168 | for(i = data_len-1; i >= limit; i--) { |
169 | __m128 d; |
170 | d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0); |
171 | d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3)); |
172 | d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3)); |
173 | d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3)); |
174 | d2 = _mm_move_ss(d2, d1); |
175 | d1 = _mm_move_ss(d1, d0); |
176 | d0 = _mm_move_ss(d0, d); |
177 | sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2)); |
178 | sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1)); |
179 | sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0)); |
180 | } |
181 | } |
182 | |
183 | _mm_storeu_ps(autoc, sum0); |
184 | _mm_storeu_ps(autoc+4, sum1); |
185 | _mm_storeu_ps(autoc+8, sum2); |
186 | } |
187 | |
188 | FLAC__SSE_TARGET("sse") |
189 | void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) |
190 | { |
191 | int i; |
192 | int limit = data_len - 16; |
193 | __m128 sum0, sum1, sum2, sum3; |
194 | |
195 | (void) lag; |
196 | FLAC__ASSERT(lag <= 16); |
197 | FLAC__ASSERT(lag <= data_len); |
198 | |
199 | sum0 = _mm_setzero_ps(); |
200 | sum1 = _mm_setzero_ps(); |
201 | sum2 = _mm_setzero_ps(); |
202 | sum3 = _mm_setzero_ps(); |
203 | |
204 | for(i = 0; i <= limit; i++) { |
205 | __m128 d, d0, d1, d2, d3; |
206 | d0 = _mm_loadu_ps(data+i); |
207 | d1 = _mm_loadu_ps(data+i+4); |
208 | d2 = _mm_loadu_ps(data+i+8); |
209 | d3 = _mm_loadu_ps(data+i+12); |
210 | d = d0; d = _mm_shuffle_ps(d, d, 0); |
211 | sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d)); |
212 | sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d)); |
213 | sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d)); |
214 | sum3 = _mm_add_ps(sum3, _mm_mul_ps(d3, d)); |
215 | } |
216 | |
217 | { |
218 | __m128 d0 = _mm_setzero_ps(); |
219 | __m128 d1 = _mm_setzero_ps(); |
220 | __m128 d2 = _mm_setzero_ps(); |
221 | __m128 d3 = _mm_setzero_ps(); |
222 | limit++; if(limit < 0) limit = 0; |
223 | |
224 | for(i = data_len-1; i >= limit; i--) { |
225 | __m128 d; |
226 | d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0); |
227 | d3 = _mm_shuffle_ps(d3, d3, _MM_SHUFFLE(2,1,0,3)); |
228 | d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3)); |
229 | d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3)); |
230 | d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3)); |
231 | d3 = _mm_move_ss(d3, d2); |
232 | d2 = _mm_move_ss(d2, d1); |
233 | d1 = _mm_move_ss(d1, d0); |
234 | d0 = _mm_move_ss(d0, d); |
235 | sum3 = _mm_add_ps(sum3, _mm_mul_ps(d, d3)); |
236 | sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2)); |
237 | sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1)); |
238 | sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0)); |
239 | } |
240 | } |
241 | |
242 | _mm_storeu_ps(autoc, sum0); |
243 | _mm_storeu_ps(autoc+4, sum1); |
244 | _mm_storeu_ps(autoc+8, sum2); |
245 | _mm_storeu_ps(autoc+12,sum3); |
246 | } |
247 | |
248 | /* old routines: faster on older Intel CPUs (up to Core 2) */ |
249 | |
250 | FLAC__SSE_TARGET("sse") |
251 | void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) |
252 | { |
253 | __m128 xmm0, xmm2, xmm5; |
254 | |
255 | (void) lag; |
256 | FLAC__ASSERT(lag > 0); |
257 | FLAC__ASSERT(lag <= 4); |
258 | FLAC__ASSERT(lag <= data_len); |
259 | FLAC__ASSERT(data_len > 0); |
260 | |
261 | xmm5 = _mm_setzero_ps(); |
262 | |
263 | xmm0 = _mm_load_ss(data++); |
264 | xmm2 = xmm0; |
265 | xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0); |
266 | |
267 | xmm0 = _mm_mul_ps(xmm0, xmm2); |
268 | xmm5 = _mm_add_ps(xmm5, xmm0); |
269 | |
270 | data_len--; |
271 | |
272 | while(data_len) |
273 | { |
274 | xmm0 = _mm_load1_ps(data++); |
275 | |
276 | xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3)); |
277 | xmm2 = _mm_move_ss(xmm2, xmm0); |
278 | xmm0 = _mm_mul_ps(xmm0, xmm2); |
279 | xmm5 = _mm_add_ps(xmm5, xmm0); |
280 | |
281 | data_len--; |
282 | } |
283 | |
284 | _mm_storeu_ps(autoc, xmm5); |
285 | } |
286 | |
287 | FLAC__SSE_TARGET("sse") |
288 | void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) |
289 | { |
290 | __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6; |
291 | |
292 | (void) lag; |
293 | FLAC__ASSERT(lag > 0); |
294 | FLAC__ASSERT(lag <= 8); |
295 | FLAC__ASSERT(lag <= data_len); |
296 | FLAC__ASSERT(data_len > 0); |
297 | |
298 | xmm5 = _mm_setzero_ps(); |
299 | xmm6 = _mm_setzero_ps(); |
300 | |
301 | xmm0 = _mm_load_ss(data++); |
302 | xmm2 = xmm0; |
303 | xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0); |
304 | xmm3 = _mm_setzero_ps(); |
305 | |
306 | xmm0 = _mm_mul_ps(xmm0, xmm2); |
307 | xmm5 = _mm_add_ps(xmm5, xmm0); |
308 | |
309 | data_len--; |
310 | |
311 | while(data_len) |
312 | { |
313 | xmm0 = _mm_load1_ps(data++); |
314 | |
315 | xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3)); |
316 | xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3)); |
317 | xmm3 = _mm_move_ss(xmm3, xmm2); |
318 | xmm2 = _mm_move_ss(xmm2, xmm0); |
319 | |
320 | xmm1 = xmm0; |
321 | xmm1 = _mm_mul_ps(xmm1, xmm3); |
322 | xmm0 = _mm_mul_ps(xmm0, xmm2); |
323 | xmm6 = _mm_add_ps(xmm6, xmm1); |
324 | xmm5 = _mm_add_ps(xmm5, xmm0); |
325 | |
326 | data_len--; |
327 | } |
328 | |
329 | _mm_storeu_ps(autoc, xmm5); |
330 | _mm_storeu_ps(autoc+4, xmm6); |
331 | } |
332 | |
333 | FLAC__SSE_TARGET("sse") |
334 | void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) |
335 | { |
336 | __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; |
337 | |
338 | (void) lag; |
339 | FLAC__ASSERT(lag > 0); |
340 | FLAC__ASSERT(lag <= 12); |
341 | FLAC__ASSERT(lag <= data_len); |
342 | FLAC__ASSERT(data_len > 0); |
343 | |
344 | xmm5 = _mm_setzero_ps(); |
345 | xmm6 = _mm_setzero_ps(); |
346 | xmm7 = _mm_setzero_ps(); |
347 | |
348 | xmm0 = _mm_load_ss(data++); |
349 | xmm2 = xmm0; |
350 | xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0); |
351 | xmm3 = _mm_setzero_ps(); |
352 | xmm4 = _mm_setzero_ps(); |
353 | |
354 | xmm0 = _mm_mul_ps(xmm0, xmm2); |
355 | xmm5 = _mm_add_ps(xmm5, xmm0); |
356 | |
357 | data_len--; |
358 | |
359 | while(data_len) |
360 | { |
361 | xmm0 = _mm_load1_ps(data++); |
362 | |
363 | xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3)); |
364 | xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3)); |
365 | xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3)); |
366 | xmm4 = _mm_move_ss(xmm4, xmm3); |
367 | xmm3 = _mm_move_ss(xmm3, xmm2); |
368 | xmm2 = _mm_move_ss(xmm2, xmm0); |
369 | |
370 | xmm1 = xmm0; |
371 | xmm1 = _mm_mul_ps(xmm1, xmm2); |
372 | xmm5 = _mm_add_ps(xmm5, xmm1); |
373 | xmm1 = xmm0; |
374 | xmm1 = _mm_mul_ps(xmm1, xmm3); |
375 | xmm6 = _mm_add_ps(xmm6, xmm1); |
376 | xmm0 = _mm_mul_ps(xmm0, xmm4); |
377 | xmm7 = _mm_add_ps(xmm7, xmm0); |
378 | |
379 | data_len--; |
380 | } |
381 | |
382 | _mm_storeu_ps(autoc, xmm5); |
383 | _mm_storeu_ps(autoc+4, xmm6); |
384 | _mm_storeu_ps(autoc+8, xmm7); |
385 | } |
386 | |
387 | FLAC__SSE_TARGET("sse") |
388 | void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) |
389 | { |
390 | __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9; |
391 | |
392 | (void) lag; |
393 | FLAC__ASSERT(lag > 0); |
394 | FLAC__ASSERT(lag <= 16); |
395 | FLAC__ASSERT(lag <= data_len); |
396 | FLAC__ASSERT(data_len > 0); |
397 | |
398 | xmm6 = _mm_setzero_ps(); |
399 | xmm7 = _mm_setzero_ps(); |
400 | xmm8 = _mm_setzero_ps(); |
401 | xmm9 = _mm_setzero_ps(); |
402 | |
403 | xmm0 = _mm_load_ss(data++); |
404 | xmm2 = xmm0; |
405 | xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0); |
406 | xmm3 = _mm_setzero_ps(); |
407 | xmm4 = _mm_setzero_ps(); |
408 | xmm5 = _mm_setzero_ps(); |
409 | |
410 | xmm0 = _mm_mul_ps(xmm0, xmm2); |
411 | xmm6 = _mm_add_ps(xmm6, xmm0); |
412 | |
413 | data_len--; |
414 | |
415 | while(data_len) |
416 | { |
417 | xmm0 = _mm_load1_ps(data++); |
418 | |
419 | /* shift xmm5:xmm4:xmm3:xmm2 left by one float */ |
420 | xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(2,1,0,3)); |
421 | xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3)); |
422 | xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3)); |
423 | xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3)); |
424 | xmm5 = _mm_move_ss(xmm5, xmm4); |
425 | xmm4 = _mm_move_ss(xmm4, xmm3); |
426 | xmm3 = _mm_move_ss(xmm3, xmm2); |
427 | xmm2 = _mm_move_ss(xmm2, xmm0); |
428 | |
429 | /* xmm9|xmm8|xmm7|xmm6 += xmm0|xmm0|xmm0|xmm0 * xmm5|xmm4|xmm3|xmm2 */ |
430 | xmm1 = xmm0; |
431 | xmm1 = _mm_mul_ps(xmm1, xmm5); |
432 | xmm9 = _mm_add_ps(xmm9, xmm1); |
433 | xmm1 = xmm0; |
434 | xmm1 = _mm_mul_ps(xmm1, xmm4); |
435 | xmm8 = _mm_add_ps(xmm8, xmm1); |
436 | xmm1 = xmm0; |
437 | xmm1 = _mm_mul_ps(xmm1, xmm3); |
438 | xmm7 = _mm_add_ps(xmm7, xmm1); |
439 | xmm0 = _mm_mul_ps(xmm0, xmm2); |
440 | xmm6 = _mm_add_ps(xmm6, xmm0); |
441 | |
442 | data_len--; |
443 | } |
444 | |
445 | _mm_storeu_ps(autoc, xmm6); |
446 | _mm_storeu_ps(autoc+4, xmm7); |
447 | _mm_storeu_ps(autoc+8, xmm8); |
448 | _mm_storeu_ps(autoc+12,xmm9); |
449 | } |
450 | |
451 | #endif /* FLAC__SSE_SUPPORTED */ |
452 | #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */ |
453 | #endif /* FLAC__NO_ASM */ |
454 | #endif /* FLAC__INTEGER_ONLY_LIBRARY */ |