lightrec: implement clock cache clear on cycle_multiplier change
[pcsx_rearmed.git] / deps / libretro-common / audio / resampler / drivers / sinc_resampler.c
CommitLineData
3719602c
PC
1/* Copyright (C) 2010-2020 The RetroArch team
2 *
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (sinc_resampler.c).
5 * ---------------------------------------------------------------------------------------
6 *
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23/* Bog-standard windowed SINC implementation. */
24
25#include <stdint.h>
26#include <stdlib.h>
27#include <math.h>
28#include <string.h>
29
30#include <retro_environment.h>
31#include <retro_inline.h>
32#include <filters.h>
33#include <memalign.h>
34
35#include <audio/audio_resampler.h>
36#include <filters.h>
37
38#ifdef __SSE__
39#include <xmmintrin.h>
40#endif
41
42#if defined(__AVX__)
43#include <immintrin.h>
44#endif
45
46/* Rough SNR values for upsampling:
47 * LOWEST: 40 dB
48 * LOWER: 55 dB
49 * NORMAL: 70 dB
50 * HIGHER: 110 dB
51 * HIGHEST: 140 dB
52 */
53
54/* TODO, make all this more configurable. */
55
56enum sinc_window
57{
58 SINC_WINDOW_NONE = 0,
59 SINC_WINDOW_KAISER,
60 SINC_WINDOW_LANCZOS
61};
62
63/* For the little amount of taps we're using,
64 * SSE1 is faster than AVX for some reason.
65 * AVX code is kept here though as by increasing number
66 * of sinc taps, the AVX code is clearly faster than SSE1.
67 */
68
69typedef struct rarch_sinc_resampler
70{
71 /* A buffer for phase_table, buffer_l and buffer_r
72 * are created in a single calloc().
73 * Ensure that we get as good cache locality as we can hope for. */
74 float *main_buffer;
75 float *phase_table;
76 float *buffer_l;
77 float *buffer_r;
78 unsigned phase_bits;
79 unsigned subphase_bits;
80 unsigned subphase_mask;
81 unsigned taps;
82 unsigned ptr;
83 uint32_t time;
84 float subphase_mod;
85 float kaiser_beta;
86} rarch_sinc_resampler_t;
87
88#if (defined(__ARM_NEON__) || defined(HAVE_NEON))
89
90#ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
91void process_sinc_neon_asm(float *out, const float *left,
92 const float *right, const float *coeff, unsigned taps);
93#else
94#include <arm_neon.h>
95
96/* Assumes that taps >= 8, and that taps is a multiple of 8.
97 * Not bothering to reimplement this one for the external .S
98 */
99static void resampler_sinc_process_neon_kaiser(void *re_, struct resampler_data *data)
100{
101 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
102 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
103 uint32_t ratio = phases / data->ratio;
104 const float *input = data->data_in;
105 float *output = data->data_out;
106 size_t frames = data->input_frames;
107 size_t out_frames = 0;
108 unsigned taps = resamp->taps;
109 while (frames)
110 {
111 while (frames && resamp->time >= phases)
112 {
113 /* Push in reverse to make filter more obvious. */
114 if (!resamp->ptr)
115 resamp->ptr = taps;
116 resamp->ptr--;
117
118 resamp->buffer_l[resamp->ptr + taps] =
119 resamp->buffer_l[resamp->ptr] = *input++;
120
121 resamp->buffer_r[resamp->ptr + taps] =
122 resamp->buffer_r[resamp->ptr] = *input++;
123
124 resamp->time -= phases;
125 frames--;
126 }
127
128 {
129 const float *buffer_l = resamp->buffer_l + resamp->ptr;
130 const float *buffer_r = resamp->buffer_r + resamp->ptr;
131 while (resamp->time < phases)
132 {
133 unsigned phase = resamp->time >> resamp->subphase_bits;
134 const float *phase_table = resamp->phase_table + phase * taps * 2;
135 const float *delta_table = phase_table + taps;
136 float32x4_t delta = vdupq_n_f32((resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
137 int i;
138 float32x4_t p1 = {0, 0, 0, 0}, p2 = {0, 0, 0, 0};
139 float32x2_t p3, p4;
140
141 for (i = 0; i < (int)taps; i += 8)
142 {
143 float32x4x2_t coeff8 = vld2q_f32(&phase_table[i]);
144 float32x4x2_t delta8 = vld2q_f32(&delta_table[i]);
145 float32x4x2_t left8 = vld2q_f32(&buffer_l[i]);
146 float32x4x2_t right8 = vld2q_f32(&buffer_r[i]);
147
148 coeff8.val[0] = vmlaq_f32(coeff8.val[0], delta8.val[0], delta);
149 coeff8.val[1] = vmlaq_f32(coeff8.val[1], delta8.val[1], delta);
150
151 p1 = vmlaq_f32(p1, left8.val[0], coeff8.val[0]);
152 p2 = vmlaq_f32(p2, right8.val[0], coeff8.val[0]);
153 p1 = vmlaq_f32(p1, left8.val[1], coeff8.val[1]);
154 p2 = vmlaq_f32(p2, right8.val[1], coeff8.val[1]);
155 }
156
157 p3 = vadd_f32(vget_low_f32(p1), vget_high_f32(p1));
158 p4 = vadd_f32(vget_low_f32(p2), vget_high_f32(p2));
159 vst1_f32(output, vpadd_f32(p3, p4));
160 output += 2;
161 out_frames++;
162 resamp->time += ratio;
163 }
164 }
165 }
166
167 data->output_frames = out_frames;
168}
169#endif
170
171/* Assumes that taps >= 8, and that taps is a multiple of 8. */
172static void resampler_sinc_process_neon(void *re_, struct resampler_data *data)
173{
174 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
175 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
176 uint32_t ratio = phases / data->ratio;
177 const float *input = data->data_in;
178 float *output = data->data_out;
179 size_t frames = data->input_frames;
180 size_t out_frames = 0;
181 unsigned taps = resamp->taps;
182
183 while (frames)
184 {
185 while (frames && resamp->time >= phases)
186 {
187 /* Push in reverse to make filter more obvious. */
188 if (!resamp->ptr)
189 resamp->ptr = taps;
190 resamp->ptr--;
191
192 resamp->buffer_l[resamp->ptr + taps] =
193 resamp->buffer_l[resamp->ptr] = *input++;
194
195 resamp->buffer_r[resamp->ptr + taps] =
196 resamp->buffer_r[resamp->ptr] = *input++;
197
198 resamp->time -= phases;
199 frames--;
200 }
201
202 {
203 const float *buffer_l = resamp->buffer_l + resamp->ptr;
204 const float *buffer_r = resamp->buffer_r + resamp->ptr;
205 while (resamp->time < phases)
206 {
207 unsigned phase = resamp->time >> resamp->subphase_bits;
208 const float *phase_table = resamp->phase_table + phase * taps;
209#ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
210 process_sinc_neon_asm(output, buffer_l, buffer_r, phase_table, taps);
211#else
212 int i;
213 float32x4_t p1 = {0, 0, 0, 0}, p2 = {0, 0, 0, 0};
214 float32x2_t p3, p4;
215
216 for (i = 0; i < (int)taps; i += 8)
217 {
218 float32x4x2_t coeff8 = vld2q_f32(&phase_table[i]);
219 float32x4x2_t left8 = vld2q_f32(&buffer_l[i]);
220 float32x4x2_t right8 = vld2q_f32(&buffer_r[i]);
221
222 p1 = vmlaq_f32(p1, left8.val[0], coeff8.val[0]);
223 p2 = vmlaq_f32(p2, right8.val[0], coeff8.val[0]);
224 p1 = vmlaq_f32(p1, left8.val[1], coeff8.val[1]);
225 p2 = vmlaq_f32(p2, right8.val[1], coeff8.val[1]);
226 }
227
228 p3 = vadd_f32(vget_low_f32(p1), vget_high_f32(p1));
229 p4 = vadd_f32(vget_low_f32(p2), vget_high_f32(p2));
230 vst1_f32(output, vpadd_f32(p3, p4));
231#endif
232 output += 2;
233 out_frames++;
234 resamp->time += ratio;
235 }
236 }
237 }
238
239 data->output_frames = out_frames;
240}
241#endif
242
243#if defined(__AVX__)
244static void resampler_sinc_process_avx_kaiser(void *re_, struct resampler_data *data)
245{
246 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
247 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
248
249 uint32_t ratio = phases / data->ratio;
250 const float *input = data->data_in;
251 float *output = data->data_out;
252 size_t frames = data->input_frames;
253 size_t out_frames = 0;
254 unsigned taps = resamp->taps;
255
256 {
257 while (frames)
258 {
259 while (frames && resamp->time >= phases)
260 {
261 /* Push in reverse to make filter more obvious. */
262 if (!resamp->ptr)
263 resamp->ptr = taps;
264 resamp->ptr--;
265
266 resamp->buffer_l[resamp->ptr + taps] =
267 resamp->buffer_l[resamp->ptr] = *input++;
268
269 resamp->buffer_r[resamp->ptr + taps] =
270 resamp->buffer_r[resamp->ptr] = *input++;
271
272 resamp->time -= phases;
273 frames--;
274 }
275
276 {
277 const float *buffer_l = resamp->buffer_l + resamp->ptr;
278 const float *buffer_r = resamp->buffer_r + resamp->ptr;
279 while (resamp->time < phases)
280 {
281 int i;
282 unsigned phase = resamp->time >> resamp->subphase_bits;
283
284 float *phase_table = resamp->phase_table + phase * taps * 2;
285 float *delta_table = phase_table + taps;
286 __m256 delta = _mm256_set1_ps((float)
287 (resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
288
289 __m256 sum_l = _mm256_setzero_ps();
290 __m256 sum_r = _mm256_setzero_ps();
291
292 for (i = 0; i < (int)taps; i += 8)
293 {
294 __m256 buf_l = _mm256_loadu_ps(buffer_l + i);
295 __m256 buf_r = _mm256_loadu_ps(buffer_r + i);
296 __m256 deltas = _mm256_load_ps(delta_table + i);
297 __m256 sinc = _mm256_add_ps(_mm256_load_ps((const float*)phase_table + i),
298 _mm256_mul_ps(deltas, delta));
299
300 sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
301 sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
302 }
303
304 /* hadd on AVX is weird, and acts on low-lanes
305 * and high-lanes separately. */
306 __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
307 __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
308 res_l = _mm256_hadd_ps(res_l, res_l);
309 res_r = _mm256_hadd_ps(res_r, res_r);
310 res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
311 res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);
312
313 /* This is optimized to mov %xmmN, [mem].
314 * There doesn't seem to be any _mm256_store_ss intrinsic. */
315 _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0));
316 _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0));
317
318 output += 2;
319 out_frames++;
320 resamp->time += ratio;
321 }
322 }
323 }
324 }
325
326 data->output_frames = out_frames;
327}
328
329static void resampler_sinc_process_avx(void *re_, struct resampler_data *data)
330{
331 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
332 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
333
334 uint32_t ratio = phases / data->ratio;
335 const float *input = data->data_in;
336 float *output = data->data_out;
337 size_t frames = data->input_frames;
338 size_t out_frames = 0;
339 unsigned taps = resamp->taps;
340
341 {
342 while (frames)
343 {
344 while (frames && resamp->time >= phases)
345 {
346 /* Push in reverse to make filter more obvious. */
347 if (!resamp->ptr)
348 resamp->ptr = taps;
349 resamp->ptr--;
350
351 resamp->buffer_l[resamp->ptr + taps] =
352 resamp->buffer_l[resamp->ptr] = *input++;
353
354 resamp->buffer_r[resamp->ptr + taps] =
355 resamp->buffer_r[resamp->ptr] = *input++;
356
357 resamp->time -= phases;
358 frames--;
359 }
360
361 {
362 const float *buffer_l = resamp->buffer_l + resamp->ptr;
363 const float *buffer_r = resamp->buffer_r + resamp->ptr;
364 while (resamp->time < phases)
365 {
366 int i;
367 __m256 delta;
368 unsigned phase = resamp->time >> resamp->subphase_bits;
369 float *phase_table = resamp->phase_table + phase * taps;
370
371 __m256 sum_l = _mm256_setzero_ps();
372 __m256 sum_r = _mm256_setzero_ps();
373
374 for (i = 0; i < (int)taps; i += 8)
375 {
376 __m256 buf_l = _mm256_loadu_ps(buffer_l + i);
377 __m256 buf_r = _mm256_loadu_ps(buffer_r + i);
378 __m256 sinc = _mm256_load_ps((const float*)phase_table + i);
379
380 sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
381 sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
382 }
383
384 /* hadd on AVX is weird, and acts on low-lanes
385 * and high-lanes separately. */
386 __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
387 __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
388 res_l = _mm256_hadd_ps(res_l, res_l);
389 res_r = _mm256_hadd_ps(res_r, res_r);
390 res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
391 res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);
392
393 /* This is optimized to mov %xmmN, [mem].
394 * There doesn't seem to be any _mm256_store_ss intrinsic. */
395 _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0));
396 _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0));
397
398 output += 2;
399 out_frames++;
400 resamp->time += ratio;
401 }
402 }
403 }
404 }
405
406 data->output_frames = out_frames;
407}
408#endif
409
410#if defined(__SSE__)
411static void resampler_sinc_process_sse_kaiser(void *re_, struct resampler_data *data)
412{
413 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
414 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
415
416 uint32_t ratio = phases / data->ratio;
417 const float *input = data->data_in;
418 float *output = data->data_out;
419 size_t frames = data->input_frames;
420 size_t out_frames = 0;
421 unsigned taps = resamp->taps;
422
423 {
424 while (frames)
425 {
426 while (frames && resamp->time >= phases)
427 {
428 /* Push in reverse to make filter more obvious. */
429 if (!resamp->ptr)
430 resamp->ptr = taps;
431 resamp->ptr--;
432
433 resamp->buffer_l[resamp->ptr + taps] =
434 resamp->buffer_l[resamp->ptr] = *input++;
435
436 resamp->buffer_r[resamp->ptr + taps] =
437 resamp->buffer_r[resamp->ptr] = *input++;
438
439 resamp->time -= phases;
440 frames--;
441 }
442
443 {
444 const float *buffer_l = resamp->buffer_l + resamp->ptr;
445 const float *buffer_r = resamp->buffer_r + resamp->ptr;
446 while (resamp->time < phases)
447 {
448 int i;
449 __m128 sum;
450 unsigned phase = resamp->time >> resamp->subphase_bits;
451 float *phase_table = resamp->phase_table + phase * taps * 2;
452 float *delta_table = phase_table + taps;
453 __m128 delta = _mm_set1_ps((float)
454 (resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
455
456 __m128 sum_l = _mm_setzero_ps();
457 __m128 sum_r = _mm_setzero_ps();
458
459 for (i = 0; i < (int)taps; i += 4)
460 {
461 __m128 buf_l = _mm_loadu_ps(buffer_l + i);
462 __m128 buf_r = _mm_loadu_ps(buffer_r + i);
463 __m128 deltas = _mm_load_ps(delta_table + i);
464 __m128 _sinc = _mm_add_ps(_mm_load_ps((const float*)phase_table + i),
465 _mm_mul_ps(deltas, delta));
466 sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc));
467 sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc));
468 }
469
470 /* Them annoying shuffles.
471 * sum_l = { l3, l2, l1, l0 }
472 * sum_r = { r3, r2, r1, r0 }
473 */
474
475 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r,
476 _MM_SHUFFLE(1, 0, 1, 0)),
477 _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));
478
479 /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
480 * sum = { R1, R0, L1, L0 }
481 */
482
483 sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);
484
485 /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
486 * sum = { X, R, X, L }
487 */
488
489 /* Store L */
490 _mm_store_ss(output + 0, sum);
491
492 /* movehl { X, R, X, L } == { X, R, X, R } */
493 _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum));
494
495 output += 2;
496 out_frames++;
497 resamp->time += ratio;
498 }
499 }
500 }
501 }
502
503 data->output_frames = out_frames;
504}
505
506static void resampler_sinc_process_sse(void *re_, struct resampler_data *data)
507{
508 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
509 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
510
511 uint32_t ratio = phases / data->ratio;
512 const float *input = data->data_in;
513 float *output = data->data_out;
514 size_t frames = data->input_frames;
515 size_t out_frames = 0;
516 unsigned taps = resamp->taps;
517
518 {
519 while (frames)
520 {
521 while (frames && resamp->time >= phases)
522 {
523 /* Push in reverse to make filter more obvious. */
524 if (!resamp->ptr)
525 resamp->ptr = taps;
526 resamp->ptr--;
527
528 resamp->buffer_l[resamp->ptr + taps] =
529 resamp->buffer_l[resamp->ptr] = *input++;
530
531 resamp->buffer_r[resamp->ptr + taps] =
532 resamp->buffer_r[resamp->ptr] = *input++;
533
534 resamp->time -= phases;
535 frames--;
536 }
537
538 {
539 const float *buffer_l = resamp->buffer_l + resamp->ptr;
540 const float *buffer_r = resamp->buffer_r + resamp->ptr;
541 while (resamp->time < phases)
542 {
543 int i;
544 __m128 sum;
545 unsigned phase = resamp->time >> resamp->subphase_bits;
546 float *phase_table = resamp->phase_table + phase * taps;
547
548 __m128 sum_l = _mm_setzero_ps();
549 __m128 sum_r = _mm_setzero_ps();
550
551 for (i = 0; i < (int)taps; i += 4)
552 {
553 __m128 buf_l = _mm_loadu_ps(buffer_l + i);
554 __m128 buf_r = _mm_loadu_ps(buffer_r + i);
555 __m128 _sinc = _mm_load_ps((const float*)phase_table + i);
556 sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc));
557 sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc));
558 }
559
560 /* Them annoying shuffles.
561 * sum_l = { l3, l2, l1, l0 }
562 * sum_r = { r3, r2, r1, r0 }
563 */
564
565 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r,
566 _MM_SHUFFLE(1, 0, 1, 0)),
567 _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));
568
569 /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
570 * sum = { R1, R0, L1, L0 }
571 */
572
573 sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);
574
575 /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
576 * sum = { X, R, X, L }
577 */
578
579 /* Store L */
580 _mm_store_ss(output + 0, sum);
581
582 /* movehl { X, R, X, L } == { X, R, X, R } */
583 _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum));
584
585 output += 2;
586 out_frames++;
587 resamp->time += ratio;
588 }
589 }
590 }
591 }
592
593 data->output_frames = out_frames;
594}
595#endif
596
597static void resampler_sinc_process_c_kaiser(void *re_, struct resampler_data *data)
598{
599 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
600 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
601
602 uint32_t ratio = phases / data->ratio;
603 const float *input = data->data_in;
604 float *output = data->data_out;
605 size_t frames = data->input_frames;
606 size_t out_frames = 0;
607 unsigned taps = resamp->taps;
608
609 {
610 while (frames)
611 {
612 while (frames && resamp->time >= phases)
613 {
614 /* Push in reverse to make filter more obvious. */
615 if (!resamp->ptr)
616 resamp->ptr = taps;
617 resamp->ptr--;
618
619 resamp->buffer_l[resamp->ptr + taps] =
620 resamp->buffer_l[resamp->ptr] = *input++;
621
622 resamp->buffer_r[resamp->ptr + taps] =
623 resamp->buffer_r[resamp->ptr] = *input++;
624
625 resamp->time -= phases;
626 frames--;
627 }
628
629 {
630 const float *buffer_l = resamp->buffer_l + resamp->ptr;
631 const float *buffer_r = resamp->buffer_r + resamp->ptr;
632 while (resamp->time < phases)
633 {
634 int i;
635 float sum_l = 0.0f;
636 float sum_r = 0.0f;
637 unsigned phase = resamp->time >> resamp->subphase_bits;
638 float *phase_table = resamp->phase_table + phase * taps * 2;
639 float *delta_table = phase_table + taps;
640 float delta = (float)
641 (resamp->time & resamp->subphase_mask) * resamp->subphase_mod;
642
643 for (i = 0; i < (int)taps; i++)
644 {
645 float sinc_val = phase_table[i] + delta_table[i] * delta;
646
647 sum_l += buffer_l[i] * sinc_val;
648 sum_r += buffer_r[i] * sinc_val;
649 }
650
651 output[0] = sum_l;
652 output[1] = sum_r;
653
654 output += 2;
655 out_frames++;
656 resamp->time += ratio;
657 }
658 }
659
660 }
661 }
662
663 data->output_frames = out_frames;
664}
665
666static void resampler_sinc_process_c(void *re_, struct resampler_data *data)
667{
668 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
669 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
670
671 uint32_t ratio = phases / data->ratio;
672 const float *input = data->data_in;
673 float *output = data->data_out;
674 size_t frames = data->input_frames;
675 size_t out_frames = 0;
676 unsigned taps = resamp->taps;
677
678 {
679 while (frames)
680 {
681 while (frames && resamp->time >= phases)
682 {
683 /* Push in reverse to make filter more obvious. */
684 if (!resamp->ptr)
685 resamp->ptr = taps;
686 resamp->ptr--;
687
688 resamp->buffer_l[resamp->ptr + taps] =
689 resamp->buffer_l[resamp->ptr] = *input++;
690
691 resamp->buffer_r[resamp->ptr + taps] =
692 resamp->buffer_r[resamp->ptr] = *input++;
693
694 resamp->time -= phases;
695 frames--;
696 }
697
698 {
699 const float *buffer_l = resamp->buffer_l + resamp->ptr;
700 const float *buffer_r = resamp->buffer_r + resamp->ptr;
701 while (resamp->time < phases)
702 {
703 int i;
704 float sum_l = 0.0f;
705 float sum_r = 0.0f;
706 unsigned phase = resamp->time >> resamp->subphase_bits;
707 float *phase_table = resamp->phase_table + phase * taps;
708
709 for (i = 0; i < (int)taps; i++)
710 {
711 float sinc_val = phase_table[i];
712
713 sum_l += buffer_l[i] * sinc_val;
714 sum_r += buffer_r[i] * sinc_val;
715 }
716
717 output[0] = sum_l;
718 output[1] = sum_r;
719
720 output += 2;
721 out_frames++;
722 resamp->time += ratio;
723 }
724 }
725
726 }
727 }
728
729 data->output_frames = out_frames;
730}
731
732static void resampler_sinc_free(void *data)
733{
734 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)data;
735 if (resamp)
736 memalign_free(resamp->main_buffer);
737 free(resamp);
738}
739
740static void sinc_init_table_kaiser(rarch_sinc_resampler_t *resamp,
741 double cutoff,
742 float *phase_table, int phases, int taps, bool calculate_delta)
743{
744 int i, j;
745 /* Kaiser window function - need to normalize w(0) to 1.0f */
746 float kaiser_beta = resamp->kaiser_beta;
747 double window_mod = besseli0(kaiser_beta);
748 int stride = calculate_delta ? 2 : 1;
749 double sidelobes = taps / 2.0;
750
751 for (i = 0; i < phases; i++)
752 {
753 for (j = 0; j < taps; j++)
754 {
755 float val;
756 double sinc_phase;
757 int n = j * phases + i;
758 double window_phase = (double)n / (phases * taps); /* [0, 1). */
759 window_phase = 2.0 * window_phase - 1.0; /* [-1, 1) */
760 sinc_phase = sidelobes * window_phase;
761 val = cutoff * sinc(M_PI * sinc_phase * cutoff) *
762 besseli0(kaiser_beta * sqrtf(1 - window_phase * window_phase))
763 / window_mod;
764 phase_table[i * stride * taps + j] = val;
765 }
766 }
767
768 if (calculate_delta)
769 {
770 int phase;
771 int p;
772
773 for (p = 0; p < phases - 1; p++)
774 {
775 for (j = 0; j < taps; j++)
776 {
777 float delta = phase_table[(p + 1) * stride * taps + j] -
778 phase_table[p * stride * taps + j];
779 phase_table[(p * stride + 1) * taps + j] = delta;
780 }
781 }
782
783 phase = phases - 1;
784 for (j = 0; j < taps; j++)
785 {
786 float val, delta;
787 double sinc_phase;
788 int n = j * phases + (phase + 1);
789 double window_phase = (double)n / (phases * taps); /* (0, 1]. */
790 window_phase = 2.0 * window_phase - 1.0; /* (-1, 1] */
791 sinc_phase = sidelobes * window_phase;
792
793 val = cutoff * sinc(M_PI * sinc_phase * cutoff) *
794 besseli0(resamp->kaiser_beta * sqrtf(1 - window_phase *
795window_phase)) / window_mod;
796 delta = (val - phase_table[phase * stride * taps + j]);
797 phase_table[(phase * stride + 1) * taps + j] = delta;
798 }
799 }
800}
801
802static void sinc_init_table_lanczos(
803 rarch_sinc_resampler_t *resamp, double cutoff,
804 float *phase_table, int phases, int taps, bool calculate_delta)
805{
806 int i, j;
807 /* Lanczos window function - need to normalize w(0) to 1.0f */
808 double window_mod = 1.0;
809 int stride = calculate_delta ? 2 : 1;
810 double sidelobes = taps / 2.0;
811
812 for (i = 0; i < phases; i++)
813 {
814 for (j = 0; j < taps; j++)
815 {
816 double sinc_phase;
817 float val;
818 int n = j * phases + i;
819 double window_phase = (double)n / (phases * taps); /* [0, 1). */
820 window_phase = 2.0 * window_phase - 1.0; /* [-1, 1) */
821 sinc_phase = sidelobes * window_phase;
822 val = cutoff * sinc(M_PI * sinc_phase * cutoff) *
823 sinc(M_PI * window_phase) / window_mod;
824 phase_table[i * stride * taps + j] = val;
825 }
826 }
827
828 if (calculate_delta)
829 {
830 int phase;
831 int p;
832
833 for (p = 0; p < phases - 1; p++)
834 {
835 for (j = 0; j < taps; j++)
836 {
837 float delta = phase_table[(p + 1) * stride * taps + j] -
838 phase_table[p * stride * taps + j];
839 phase_table[(p * stride + 1) * taps + j] = delta;
840 }
841 }
842
843 phase = phases - 1;
844 for (j = 0; j < taps; j++)
845 {
846 float val, delta;
847 double sinc_phase;
848 int n = j * phases + (phase + 1);
849 double window_phase = (double)n / (phases * taps); /* (0, 1]. */
850 window_phase = 2.0 * window_phase - 1.0; /* (-1, 1] */
851 sinc_phase = sidelobes * window_phase;
852
853 val = cutoff * sinc(M_PI * sinc_phase * cutoff) *
854 sinc(M_PI * window_phase) / window_mod;
855 delta = (val - phase_table[phase * stride * taps + j]);
856 phase_table[(phase * stride + 1) * taps + j] = delta;
857 }
858 }
859}
860
861static void *resampler_sinc_new(const struct resampler_config *config,
862 double bandwidth_mod, enum resampler_quality quality,
863 resampler_simd_mask_t mask)
864{
865 double cutoff = 0.0;
866 size_t phase_elems = 0;
867 size_t elems = 0;
868 unsigned enable_avx = 0;
869 unsigned sidelobes = 0;
870 enum sinc_window window_type = SINC_WINDOW_NONE;
871 rarch_sinc_resampler_t *re = (rarch_sinc_resampler_t*)
872 calloc(1, sizeof(*re));
873
874 if (!re)
875 return NULL;
876
877 switch (quality)
878 {
879 case RESAMPLER_QUALITY_LOWEST:
880 cutoff = 0.98;
881 sidelobes = 2;
882 re->phase_bits = 12;
883 re->subphase_bits = 10;
884 window_type = SINC_WINDOW_LANCZOS;
885 break;
886 case RESAMPLER_QUALITY_LOWER:
887 cutoff = 0.98;
888 sidelobes = 4;
889 re->phase_bits = 12;
890 re->subphase_bits = 10;
891 window_type = SINC_WINDOW_LANCZOS;
892 break;
893 case RESAMPLER_QUALITY_HIGHER:
894 cutoff = 0.90;
895 sidelobes = 32;
896 re->phase_bits = 10;
897 re->subphase_bits = 14;
898 window_type = SINC_WINDOW_KAISER;
899 re->kaiser_beta = 10.5;
900 enable_avx = 1;
901 break;
902 case RESAMPLER_QUALITY_HIGHEST:
903 cutoff = 0.962;
904 sidelobes = 128;
905 re->phase_bits = 10;
906 re->subphase_bits = 14;
907 window_type = SINC_WINDOW_KAISER;
908 re->kaiser_beta = 14.5;
909 enable_avx = 1;
910 break;
911 case RESAMPLER_QUALITY_NORMAL:
912 case RESAMPLER_QUALITY_DONTCARE:
913 cutoff = 0.825;
914 sidelobes = 8;
915 re->phase_bits = 8;
916 re->subphase_bits = 16;
917 window_type = SINC_WINDOW_KAISER;
918 re->kaiser_beta = 5.5;
919 break;
920 }
921
922 re->subphase_mask = (1 << re->subphase_bits) - 1;
923 re->subphase_mod = 1.0f / (1 << re->subphase_bits);
924 re->taps = sidelobes * 2;
925
926 /* Downsampling, must lower cutoff, and extend number of
927 * taps accordingly to keep same stopband attenuation. */
928 if (bandwidth_mod < 1.0)
929 {
930 cutoff *= bandwidth_mod;
931 re->taps = (unsigned)ceil(re->taps / bandwidth_mod);
932 }
933
934 /* Be SIMD-friendly. */
935#if defined(__AVX__)
936 if (enable_avx)
937 re->taps = (re->taps + 7) & ~7;
938 else
939#endif
940 {
941#if (defined(__ARM_NEON__) || defined(HAVE_NEON))
942 re->taps = (re->taps + 7) & ~7;
943#else
944 re->taps = (re->taps + 3) & ~3;
945#endif
946 }
947
948 phase_elems = ((1 << re->phase_bits) * re->taps);
949 if (window_type == SINC_WINDOW_KAISER)
950 phase_elems = phase_elems * 2;
951 elems = phase_elems + 4 * re->taps;
952
953 re->main_buffer = (float*)memalign_alloc(128, sizeof(float) * elems);
954 if (!re->main_buffer)
955 goto error;
956
957 memset(re->main_buffer, 0, sizeof(float) * elems);
958
959 re->phase_table = re->main_buffer;
960 re->buffer_l = re->main_buffer + phase_elems;
961 re->buffer_r = re->buffer_l + 2 * re->taps;
962
963 switch (window_type)
964 {
965 case SINC_WINDOW_LANCZOS:
966 sinc_init_table_lanczos(re, cutoff, re->phase_table,
967 1 << re->phase_bits, re->taps, false);
968 break;
969 case SINC_WINDOW_KAISER:
970 sinc_init_table_kaiser(re, cutoff, re->phase_table,
971 1 << re->phase_bits, re->taps, true);
972 break;
973 case SINC_WINDOW_NONE:
974 goto error;
975 }
976
977 sinc_resampler.process = resampler_sinc_process_c;
978 if (window_type == SINC_WINDOW_KAISER)
979 sinc_resampler.process = resampler_sinc_process_c_kaiser;
980
981 if (mask & RESAMPLER_SIMD_AVX && enable_avx)
982 {
983#if defined(__AVX__)
984 sinc_resampler.process = resampler_sinc_process_avx;
985 if (window_type == SINC_WINDOW_KAISER)
986 sinc_resampler.process = resampler_sinc_process_avx_kaiser;
987#endif
988 }
989 else if (mask & RESAMPLER_SIMD_SSE)
990 {
991#if defined(__SSE__)
992 sinc_resampler.process = resampler_sinc_process_sse;
993 if (window_type == SINC_WINDOW_KAISER)
994 sinc_resampler.process = resampler_sinc_process_sse_kaiser;
995#endif
996 }
997 else if (mask & RESAMPLER_SIMD_NEON)
998 {
999#if (defined(__ARM_NEON__) || defined(HAVE_NEON))
1000#ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
1001 if (window_type != SINC_WINDOW_KAISER)
1002 sinc_resampler.process = resampler_sinc_process_neon;
1003#else
1004 sinc_resampler.process = resampler_sinc_process_neon;
1005 if (window_type == SINC_WINDOW_KAISER)
1006 sinc_resampler.process = resampler_sinc_process_neon_kaiser;
1007#endif
1008#endif
1009 }
1010
1011 return re;
1012
1013error:
1014 resampler_sinc_free(re);
1015 return NULL;
1016}
1017
1018retro_resampler_t sinc_resampler = {
1019 resampler_sinc_new,
1020 resampler_sinc_process_c,
1021 resampler_sinc_free,
1022 RESAMPLER_API_VERSION,
1023 "sinc",
1024 "sinc"
1025};