lightrec: implement clock cache clear on cycle_multiplier change
[pcsx_rearmed.git] / deps / libretro-common / audio / resampler / drivers / sinc_resampler.c
1 /* Copyright  (C) 2010-2020 The RetroArch team
2  *
3  * ---------------------------------------------------------------------------------------
4  * The following license statement only applies to this file (sinc_resampler.c).
5  * ---------------------------------------------------------------------------------------
6  *
7  * Permission is hereby granted, free of charge,
8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation the rights to
10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22
23 /* Bog-standard windowed SINC implementation. */
24
25 #include <stdint.h>
26 #include <stdlib.h>
27 #include <math.h>
28 #include <string.h>
29
30 #include <retro_environment.h>
31 #include <retro_inline.h>
32 #include <filters.h>
33 #include <memalign.h>
34
35 #include <audio/audio_resampler.h>
36 #include <filters.h>
37
38 #ifdef __SSE__
39 #include <xmmintrin.h>
40 #endif
41
42 #if defined(__AVX__)
43 #include <immintrin.h>
44 #endif
45
46 /* Rough SNR values for upsampling:
47  * LOWEST: 40 dB
48  * LOWER: 55 dB
49  * NORMAL: 70 dB
50  * HIGHER: 110 dB
51  * HIGHEST: 140 dB
52  */
53
54 /* TODO, make all this more configurable. */
55
56 enum sinc_window
57 {
58    SINC_WINDOW_NONE   = 0,
59    SINC_WINDOW_KAISER,
60    SINC_WINDOW_LANCZOS
61 };
62
63 /* For the little amount of taps we're using,
64  * SSE1 is faster than AVX for some reason.
65  * AVX code is kept here though as by increasing number
66  * of sinc taps, the AVX code is clearly faster than SSE1.
67  */
68
69 typedef struct rarch_sinc_resampler
70 {
71    /* A buffer for phase_table, buffer_l and buffer_r
72     * are created in a single calloc().
73     * Ensure that we get as good cache locality as we can hope for. */
74    float *main_buffer;
75    float *phase_table;
76    float *buffer_l;
77    float *buffer_r;
78    unsigned phase_bits;
79    unsigned subphase_bits;
80    unsigned subphase_mask;
81    unsigned taps;
82    unsigned ptr;
83    uint32_t time;
84    float subphase_mod;
85    float kaiser_beta;
86 } rarch_sinc_resampler_t;
87
88 #if (defined(__ARM_NEON__) || defined(HAVE_NEON))
89
90 #ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
91 void process_sinc_neon_asm(float *out, const float *left,
92       const float *right, const float *coeff, unsigned taps);
93 #else
94 #include <arm_neon.h>
95
96 /* Assumes that taps >= 8, and that taps is a multiple of 8.
97  * Not bothering to reimplement this one for the external .S
98  */
99 static void resampler_sinc_process_neon_kaiser(void *re_, struct resampler_data *data)
100 {
101    rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
102    unsigned phases                = 1 << (resamp->phase_bits + resamp->subphase_bits);
103    uint32_t ratio                 = phases / data->ratio;
104    const float *input             = data->data_in;
105    float *output                  = data->data_out;
106    size_t frames                  = data->input_frames;
107    size_t out_frames              = 0;
108    unsigned taps                  = resamp->taps;
109    while (frames)
110    {
111       while (frames && resamp->time >= phases)
112       {
113          /* Push in reverse to make filter more obvious. */
114          if (!resamp->ptr)
115             resamp->ptr = taps;
116          resamp->ptr--;
117
118          resamp->buffer_l[resamp->ptr + taps] =
119             resamp->buffer_l[resamp->ptr]     = *input++;
120
121          resamp->buffer_r[resamp->ptr + taps] =
122             resamp->buffer_r[resamp->ptr]     = *input++;
123
124          resamp->time                        -= phases;
125          frames--;
126       }
127
128       {
129          const float *buffer_l    = resamp->buffer_l + resamp->ptr;
130          const float *buffer_r    = resamp->buffer_r + resamp->ptr;
131          while (resamp->time < phases)
132          {
133             unsigned phase           = resamp->time >> resamp->subphase_bits;
134             const float *phase_table = resamp->phase_table + phase * taps * 2;
135             const float *delta_table = phase_table + taps;
136             float32x4_t delta        = vdupq_n_f32((resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
137             int i;
138             float32x4_t p1 = {0, 0, 0, 0}, p2 = {0, 0, 0, 0};
139             float32x2_t p3, p4;
140
141             for (i = 0; i < (int)taps; i += 8)
142             {
143                float32x4x2_t coeff8  = vld2q_f32(&phase_table[i]);
144                float32x4x2_t delta8  = vld2q_f32(&delta_table[i]);
145                float32x4x2_t left8   = vld2q_f32(&buffer_l[i]);
146                float32x4x2_t right8  = vld2q_f32(&buffer_r[i]);
147
148                coeff8.val[0] = vmlaq_f32(coeff8.val[0], delta8.val[0], delta);
149                coeff8.val[1] = vmlaq_f32(coeff8.val[1], delta8.val[1], delta);
150
151                p1 = vmlaq_f32(p1,  left8.val[0], coeff8.val[0]);
152                p2 = vmlaq_f32(p2, right8.val[0], coeff8.val[0]);
153                p1 = vmlaq_f32(p1,  left8.val[1], coeff8.val[1]);
154                p2 = vmlaq_f32(p2, right8.val[1], coeff8.val[1]);
155             }
156
157             p3 = vadd_f32(vget_low_f32(p1), vget_high_f32(p1));
158             p4 = vadd_f32(vget_low_f32(p2), vget_high_f32(p2));
159             vst1_f32(output, vpadd_f32(p3, p4));
160             output                 += 2;
161             out_frames++;
162             resamp->time           += ratio;
163          }
164       }
165    }
166
167    data->output_frames = out_frames;
168 }
169 #endif
170
171 /* Assumes that taps >= 8, and that taps is a multiple of 8. */
172 static void resampler_sinc_process_neon(void *re_, struct resampler_data *data)
173 {
174    rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
175    unsigned phases                = 1 << (resamp->phase_bits + resamp->subphase_bits);
176    uint32_t ratio                 = phases / data->ratio;
177    const float *input             = data->data_in;
178    float *output                  = data->data_out;
179    size_t frames                  = data->input_frames;
180    size_t out_frames              = 0;
181    unsigned taps                  = resamp->taps;
182
183    while (frames)
184    {
185       while (frames && resamp->time >= phases)
186       {
187          /* Push in reverse to make filter more obvious. */
188          if (!resamp->ptr)
189             resamp->ptr = taps;
190          resamp->ptr--;
191
192          resamp->buffer_l[resamp->ptr + taps] =
193             resamp->buffer_l[resamp->ptr]     = *input++;
194
195          resamp->buffer_r[resamp->ptr + taps] =
196             resamp->buffer_r[resamp->ptr]     = *input++;
197
198          resamp->time                        -= phases;
199          frames--;
200       }
201
202       {
203          const float *buffer_l    = resamp->buffer_l + resamp->ptr;
204          const float *buffer_r    = resamp->buffer_r + resamp->ptr;
205          while (resamp->time < phases)
206          {
207             unsigned phase           = resamp->time >> resamp->subphase_bits;
208             const float *phase_table = resamp->phase_table + phase * taps;
209 #ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
210             process_sinc_neon_asm(output, buffer_l, buffer_r, phase_table, taps);
211 #else
212             int i;
213             float32x4_t p1 = {0, 0, 0, 0}, p2 = {0, 0, 0, 0};
214             float32x2_t p3, p4;
215
216             for (i = 0; i < (int)taps; i += 8)
217             {
218                float32x4x2_t coeff8  = vld2q_f32(&phase_table[i]);
219                float32x4x2_t left8   = vld2q_f32(&buffer_l[i]);
220                float32x4x2_t right8  = vld2q_f32(&buffer_r[i]);
221
222                p1 = vmlaq_f32(p1,  left8.val[0], coeff8.val[0]);
223                p2 = vmlaq_f32(p2, right8.val[0], coeff8.val[0]);
224                p1 = vmlaq_f32(p1,  left8.val[1], coeff8.val[1]);
225                p2 = vmlaq_f32(p2, right8.val[1], coeff8.val[1]);
226             }
227
228             p3 = vadd_f32(vget_low_f32(p1), vget_high_f32(p1));
229             p4 = vadd_f32(vget_low_f32(p2), vget_high_f32(p2));
230             vst1_f32(output, vpadd_f32(p3, p4));
231 #endif
232             output                 += 2;
233             out_frames++;
234             resamp->time           += ratio;
235          }
236       }
237    }
238
239    data->output_frames = out_frames;
240 }
241 #endif
242
243 #if defined(__AVX__)
244 static void resampler_sinc_process_avx_kaiser(void *re_, struct resampler_data *data)
245 {
246    rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
247    unsigned phases                = 1 << (resamp->phase_bits + resamp->subphase_bits);
248
249    uint32_t ratio                 = phases / data->ratio;
250    const float *input             = data->data_in;
251    float *output                  = data->data_out;
252    size_t frames                  = data->input_frames;
253    size_t out_frames              = 0;
254    unsigned taps                  = resamp->taps;
255
256    {
257       while (frames)
258       {
259          while (frames && resamp->time >= phases)
260          {
261             /* Push in reverse to make filter more obvious. */
262             if (!resamp->ptr)
263                resamp->ptr = taps;
264             resamp->ptr--;
265
266             resamp->buffer_l[resamp->ptr + taps] =
267                resamp->buffer_l[resamp->ptr]     = *input++;
268
269             resamp->buffer_r[resamp->ptr + taps] =
270                resamp->buffer_r[resamp->ptr]     = *input++;
271
272             resamp->time                                -= phases;
273             frames--;
274          }
275
276          {
277             const float *buffer_l    = resamp->buffer_l + resamp->ptr;
278             const float *buffer_r    = resamp->buffer_r + resamp->ptr;
279             while (resamp->time < phases)
280             {
281                int i;
282                unsigned phase           = resamp->time >> resamp->subphase_bits;
283
284                float *phase_table       = resamp->phase_table + phase * taps * 2;
285                float *delta_table       = phase_table + taps;
286                __m256 delta             = _mm256_set1_ps((float)
287                      (resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
288
289                __m256 sum_l             = _mm256_setzero_ps();
290                __m256 sum_r             = _mm256_setzero_ps();
291
292                for (i = 0; i < (int)taps; i += 8)
293                {
294                   __m256 buf_l  = _mm256_loadu_ps(buffer_l + i);
295                   __m256 buf_r  = _mm256_loadu_ps(buffer_r + i);
296                   __m256 deltas = _mm256_load_ps(delta_table + i);
297                   __m256 sinc   = _mm256_add_ps(_mm256_load_ps((const float*)phase_table + i),
298                         _mm256_mul_ps(deltas, delta));
299
300                   sum_l         = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
301                   sum_r         = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
302                }
303
304                /* hadd on AVX is weird, and acts on low-lanes
305                 * and high-lanes separately. */
306                __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
307                __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
308                res_l        = _mm256_hadd_ps(res_l, res_l);
309                res_r        = _mm256_hadd_ps(res_r, res_r);
310                res_l        = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
311                res_r        = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);
312
313                /* This is optimized to mov %xmmN, [mem].
314                 * There doesn't seem to be any _mm256_store_ss intrinsic. */
315                _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0));
316                _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0));
317
318                output += 2;
319                out_frames++;
320                resamp->time += ratio;
321             }
322          }
323       }
324    }
325
326    data->output_frames = out_frames;
327 }
328
329 static void resampler_sinc_process_avx(void *re_, struct resampler_data *data)
330 {
331    rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
332    unsigned phases                = 1 << (resamp->phase_bits + resamp->subphase_bits);
333
334    uint32_t ratio                 = phases / data->ratio;
335    const float *input             = data->data_in;
336    float *output                  = data->data_out;
337    size_t frames                  = data->input_frames;
338    size_t out_frames              = 0;
339    unsigned taps                  = resamp->taps;
340
341    {
342       while (frames)
343       {
344          while (frames && resamp->time >= phases)
345          {
346             /* Push in reverse to make filter more obvious. */
347             if (!resamp->ptr)
348                resamp->ptr = taps;
349             resamp->ptr--;
350
351             resamp->buffer_l[resamp->ptr + taps] =
352                resamp->buffer_l[resamp->ptr]     = *input++;
353
354             resamp->buffer_r[resamp->ptr + taps] =
355                resamp->buffer_r[resamp->ptr]     = *input++;
356
357             resamp->time                        -= phases;
358             frames--;
359          }
360
361          {
362             const float *buffer_l    = resamp->buffer_l + resamp->ptr;
363             const float *buffer_r    = resamp->buffer_r + resamp->ptr;
364             while (resamp->time < phases)
365             {
366                int i;
367                __m256 delta;
368                unsigned phase           = resamp->time >> resamp->subphase_bits;
369                float *phase_table       = resamp->phase_table + phase * taps;
370
371                __m256 sum_l             = _mm256_setzero_ps();
372                __m256 sum_r             = _mm256_setzero_ps();
373
374                for (i = 0; i < (int)taps; i += 8)
375                {
376                   __m256 buf_l  = _mm256_loadu_ps(buffer_l + i);
377                   __m256 buf_r  = _mm256_loadu_ps(buffer_r + i);
378                   __m256 sinc   = _mm256_load_ps((const float*)phase_table + i);
379
380                   sum_l         = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
381                   sum_r         = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
382                }
383
384                /* hadd on AVX is weird, and acts on low-lanes
385                 * and high-lanes separately. */
386                __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
387                __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
388                res_l        = _mm256_hadd_ps(res_l, res_l);
389                res_r        = _mm256_hadd_ps(res_r, res_r);
390                res_l        = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
391                res_r        = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);
392
393                /* This is optimized to mov %xmmN, [mem].
394                 * There doesn't seem to be any _mm256_store_ss intrinsic. */
395                _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0));
396                _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0));
397
398                output += 2;
399                out_frames++;
400                resamp->time += ratio;
401             }
402          }
403       }
404    }
405
406    data->output_frames = out_frames;
407 }
408 #endif
409
410 #if defined(__SSE__)
411 static void resampler_sinc_process_sse_kaiser(void *re_, struct resampler_data *data)
412 {
413    rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
414    unsigned phases                = 1 << (resamp->phase_bits + resamp->subphase_bits);
415
416    uint32_t ratio                 = phases / data->ratio;
417    const float *input             = data->data_in;
418    float *output                  = data->data_out;
419    size_t frames                  = data->input_frames;
420    size_t out_frames              = 0;
421    unsigned taps                  = resamp->taps;
422
423    {
424       while (frames)
425       {
426          while (frames && resamp->time >= phases)
427          {
428             /* Push in reverse to make filter more obvious. */
429             if (!resamp->ptr)
430                resamp->ptr = taps;
431             resamp->ptr--;
432
433             resamp->buffer_l[resamp->ptr + taps] =
434                resamp->buffer_l[resamp->ptr]     = *input++;
435
436             resamp->buffer_r[resamp->ptr + taps] =
437                resamp->buffer_r[resamp->ptr]     = *input++;
438
439             resamp->time                        -= phases;
440             frames--;
441          }
442
443          {
444             const float *buffer_l    = resamp->buffer_l + resamp->ptr;
445             const float *buffer_r    = resamp->buffer_r + resamp->ptr;
446             while (resamp->time < phases)
447             {
448                int i;
449                __m128 sum;
450                unsigned phase           = resamp->time >> resamp->subphase_bits;
451                float *phase_table       = resamp->phase_table + phase * taps * 2;
452                float *delta_table       = phase_table + taps;
453                __m128 delta             = _mm_set1_ps((float)
454                      (resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
455
456                __m128 sum_l             = _mm_setzero_ps();
457                __m128 sum_r             = _mm_setzero_ps();
458
459                for (i = 0; i < (int)taps; i += 4)
460                {
461                   __m128 buf_l = _mm_loadu_ps(buffer_l + i);
462                   __m128 buf_r = _mm_loadu_ps(buffer_r + i);
463                   __m128 deltas = _mm_load_ps(delta_table + i);
464                   __m128 _sinc  = _mm_add_ps(_mm_load_ps((const float*)phase_table + i),
465                         _mm_mul_ps(deltas, delta));
466                   sum_l        = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc));
467                   sum_r        = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc));
468                }
469
470                /* Them annoying shuffles.
471                 * sum_l = { l3, l2, l1, l0 }
472                 * sum_r = { r3, r2, r1, r0 }
473                 */
474
475                sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r,
476                         _MM_SHUFFLE(1, 0, 1, 0)),
477                      _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));
478
479                /* sum   = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
480                 * sum   = { R1, R0, L1, L0 }
481                 */
482
483                sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);
484
485                /* sum   = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
486                 * sum   = { X,  R,  X,  L }
487                 */
488
489                /* Store L */
490                _mm_store_ss(output + 0, sum);
491
492                /* movehl { X, R, X, L } == { X, R, X, R } */
493                _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum));
494
495                output += 2;
496                out_frames++;
497                resamp->time += ratio;
498             }
499          }
500       }
501    }
502
503    data->output_frames = out_frames;
504 }
505
506 static void resampler_sinc_process_sse(void *re_, struct resampler_data *data)
507 {
508    rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
509    unsigned phases                = 1 << (resamp->phase_bits + resamp->subphase_bits);
510
511    uint32_t ratio                 = phases / data->ratio;
512    const float *input             = data->data_in;
513    float *output                  = data->data_out;
514    size_t frames                  = data->input_frames;
515    size_t out_frames              = 0;
516    unsigned taps                  = resamp->taps;
517
518    {
519       while (frames)
520       {
521          while (frames && resamp->time >= phases)
522          {
523             /* Push in reverse to make filter more obvious. */
524             if (!resamp->ptr)
525                resamp->ptr = taps;
526             resamp->ptr--;
527
528             resamp->buffer_l[resamp->ptr + taps] =
529                resamp->buffer_l[resamp->ptr]     = *input++;
530
531             resamp->buffer_r[resamp->ptr + taps] =
532                resamp->buffer_r[resamp->ptr]     = *input++;
533
534             resamp->time                        -= phases;
535             frames--;
536          }
537
538          {
539             const float *buffer_l    = resamp->buffer_l + resamp->ptr;
540             const float *buffer_r    = resamp->buffer_r + resamp->ptr;
541             while (resamp->time < phases)
542             {
543                int i;
544                __m128 sum;
545                unsigned phase           = resamp->time >> resamp->subphase_bits;
546                float *phase_table       = resamp->phase_table + phase * taps;
547
548                __m128 sum_l             = _mm_setzero_ps();
549                __m128 sum_r             = _mm_setzero_ps();
550
551                for (i = 0; i < (int)taps; i += 4)
552                {
553                   __m128 buf_l = _mm_loadu_ps(buffer_l + i);
554                   __m128 buf_r = _mm_loadu_ps(buffer_r + i);
555                   __m128 _sinc = _mm_load_ps((const float*)phase_table + i);
556                   sum_l        = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc));
557                   sum_r        = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc));
558                }
559
560                /* Them annoying shuffles.
561                 * sum_l = { l3, l2, l1, l0 }
562                 * sum_r = { r3, r2, r1, r0 }
563                 */
564
565                sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r,
566                         _MM_SHUFFLE(1, 0, 1, 0)),
567                      _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));
568
569                /* sum   = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
570                 * sum   = { R1, R0, L1, L0 }
571                 */
572
573                sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);
574
575                /* sum   = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
576                 * sum   = { X,  R,  X,  L }
577                 */
578
579                /* Store L */
580                _mm_store_ss(output + 0, sum);
581
582                /* movehl { X, R, X, L } == { X, R, X, R } */
583                _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum));
584
585                output += 2;
586                out_frames++;
587                resamp->time += ratio;
588             }
589          }
590       }
591    }
592
593    data->output_frames = out_frames;
594 }
595 #endif
596
597 static void resampler_sinc_process_c_kaiser(void *re_, struct resampler_data *data)
598 {
599    rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
600    unsigned phases                = 1 << (resamp->phase_bits + resamp->subphase_bits);
601
602    uint32_t ratio                 = phases / data->ratio;
603    const float *input             = data->data_in;
604    float *output                  = data->data_out;
605    size_t frames                  = data->input_frames;
606    size_t out_frames              = 0;
607    unsigned taps                  = resamp->taps;
608
609    {
610       while (frames)
611       {
612          while (frames && resamp->time >= phases)
613          {
614             /* Push in reverse to make filter more obvious. */
615             if (!resamp->ptr)
616                resamp->ptr = taps;
617             resamp->ptr--;
618
619             resamp->buffer_l[resamp->ptr + taps]    =
620                resamp->buffer_l[resamp->ptr]        = *input++;
621
622             resamp->buffer_r[resamp->ptr + taps]    =
623                resamp->buffer_r[resamp->ptr]        = *input++;
624
625             resamp->time                           -= phases;
626             frames--;
627          }
628
629          {
630             const float *buffer_l    = resamp->buffer_l + resamp->ptr;
631             const float *buffer_r    = resamp->buffer_r + resamp->ptr;
632             while (resamp->time < phases)
633             {
634                int i;
635                float sum_l              = 0.0f;
636                float sum_r              = 0.0f;
637                unsigned phase           = resamp->time >> resamp->subphase_bits;
638                float *phase_table       = resamp->phase_table + phase * taps * 2;
639                float *delta_table       = phase_table + taps;
640                float delta              = (float)
641                   (resamp->time & resamp->subphase_mask) * resamp->subphase_mod;
642
643                for (i = 0; i < (int)taps; i++)
644                {
645                   float sinc_val        = phase_table[i] + delta_table[i] * delta;
646
647                   sum_l                += buffer_l[i] * sinc_val;
648                   sum_r                += buffer_r[i] * sinc_val;
649                }
650
651                output[0]                = sum_l;
652                output[1]                = sum_r;
653
654                output                  += 2;
655                out_frames++;
656                resamp->time            += ratio;
657             }
658          }
659
660       }
661    }
662
663    data->output_frames = out_frames;
664 }
665
666 static void resampler_sinc_process_c(void *re_, struct resampler_data *data)
667 {
668    rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
669    unsigned phases                = 1 << (resamp->phase_bits + resamp->subphase_bits);
670
671    uint32_t ratio                 = phases / data->ratio;
672    const float *input             = data->data_in;
673    float *output                  = data->data_out;
674    size_t frames                  = data->input_frames;
675    size_t out_frames              = 0;
676    unsigned taps                  = resamp->taps;
677
678    {
679       while (frames)
680       {
681          while (frames && resamp->time >= phases)
682          {
683             /* Push in reverse to make filter more obvious. */
684             if (!resamp->ptr)
685                resamp->ptr = taps;
686             resamp->ptr--;
687
688             resamp->buffer_l[resamp->ptr + taps]    =
689                resamp->buffer_l[resamp->ptr]        = *input++;
690
691             resamp->buffer_r[resamp->ptr + taps]    =
692                resamp->buffer_r[resamp->ptr]        = *input++;
693
694             resamp->time                           -= phases;
695             frames--;
696          }
697
698          {
699             const float *buffer_l    = resamp->buffer_l + resamp->ptr;
700             const float *buffer_r    = resamp->buffer_r + resamp->ptr;
701             while (resamp->time < phases)
702             {
703                int i;
704                float sum_l              = 0.0f;
705                float sum_r              = 0.0f;
706                unsigned phase           = resamp->time >> resamp->subphase_bits;
707                float *phase_table       = resamp->phase_table + phase * taps;
708
709                for (i = 0; i < (int)taps; i++)
710                {
711                   float sinc_val        = phase_table[i];
712
713                   sum_l                += buffer_l[i] * sinc_val;
714                   sum_r                += buffer_r[i] * sinc_val;
715                }
716
717                output[0]                = sum_l;
718                output[1]                = sum_r;
719
720                output                  += 2;
721                out_frames++;
722                resamp->time            += ratio;
723             }
724          }
725
726       }
727    }
728
729    data->output_frames = out_frames;
730 }
731
732 static void resampler_sinc_free(void *data)
733 {
734    rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)data;
735    if (resamp)
736       memalign_free(resamp->main_buffer);
737    free(resamp);
738 }
739
740 static void sinc_init_table_kaiser(rarch_sinc_resampler_t *resamp,
741       double cutoff,
742       float *phase_table, int phases, int taps, bool calculate_delta)
743 {
744    int i, j;
745    /* Kaiser window function - need to normalize w(0) to 1.0f */
746    float kaiser_beta    = resamp->kaiser_beta;
747    double    window_mod = besseli0(kaiser_beta);
748    int           stride = calculate_delta ? 2 : 1;
749    double     sidelobes = taps / 2.0;
750
751    for (i = 0; i < phases; i++)
752    {
753       for (j = 0; j < taps; j++)
754       {
755          float val;
756          double sinc_phase;
757          int               n = j * phases + i;
758          double window_phase = (double)n / (phases * taps); /* [0, 1). */
759          window_phase        = 2.0 * window_phase - 1.0; /* [-1, 1) */
760          sinc_phase          = sidelobes * window_phase;
761          val                 = cutoff * sinc(M_PI * sinc_phase * cutoff) *
762               besseli0(kaiser_beta * sqrtf(1 - window_phase * window_phase))
763             / window_mod;
764          phase_table[i * stride * taps + j] = val;
765       }
766    }
767
768    if (calculate_delta)
769    {
770       int phase;
771       int p;
772
773       for (p = 0; p < phases - 1; p++)
774       {
775          for (j = 0; j < taps; j++)
776          {
777             float delta = phase_table[(p + 1) * stride * taps + j] -
778                phase_table[p * stride * taps + j];
779             phase_table[(p * stride + 1) * taps + j] = delta;
780          }
781       }
782
783       phase = phases - 1;
784       for (j = 0; j < taps; j++)
785       {
786          float val, delta;
787          double sinc_phase;
788          int n               = j * phases + (phase + 1);
789          double window_phase = (double)n / (phases * taps); /* (0, 1]. */
790          window_phase        = 2.0 * window_phase - 1.0; /* (-1, 1] */
791          sinc_phase          = sidelobes * window_phase;
792
793          val                 = cutoff * sinc(M_PI * sinc_phase * cutoff) *
794               besseli0(resamp->kaiser_beta * sqrtf(1 - window_phase *
795 window_phase)) / window_mod;
796          delta = (val - phase_table[phase * stride * taps + j]);
797          phase_table[(phase * stride + 1) * taps + j] = delta;
798       }
799    }
800 }
801
802 static void sinc_init_table_lanczos(
803       rarch_sinc_resampler_t *resamp, double cutoff,
804       float *phase_table, int phases, int taps, bool calculate_delta)
805 {
806    int i, j;
807    /* Lanczos window function - need to normalize w(0) to 1.0f */
808    double    window_mod = 1.0;
809    int           stride = calculate_delta ? 2 : 1;
810    double     sidelobes = taps / 2.0;
811
812    for (i = 0; i < phases; i++)
813    {
814       for (j = 0; j < taps; j++)
815       {
816          double sinc_phase;
817          float val;
818          int               n = j * phases + i;
819          double window_phase = (double)n / (phases * taps); /* [0, 1). */
820          window_phase        = 2.0 * window_phase - 1.0; /* [-1, 1) */
821          sinc_phase          = sidelobes * window_phase;
822          val                 = cutoff * sinc(M_PI * sinc_phase * cutoff) *
823             sinc(M_PI * window_phase) / window_mod;
824          phase_table[i * stride * taps + j] = val;
825       }
826    }
827
828    if (calculate_delta)
829    {
830       int phase;
831       int p;
832
833       for (p = 0; p < phases - 1; p++)
834       {
835          for (j = 0; j < taps; j++)
836          {
837             float delta = phase_table[(p + 1) * stride * taps + j] -
838                phase_table[p * stride * taps + j];
839             phase_table[(p * stride + 1) * taps + j] = delta;
840          }
841       }
842
843       phase = phases - 1;
844       for (j = 0; j < taps; j++)
845       {
846          float val, delta;
847          double sinc_phase;
848          int n               = j * phases + (phase + 1);
849          double window_phase = (double)n / (phases * taps); /* (0, 1]. */
850          window_phase        = 2.0 * window_phase - 1.0; /* (-1, 1] */
851          sinc_phase          = sidelobes * window_phase;
852
853          val                 = cutoff * sinc(M_PI * sinc_phase * cutoff) *
854             sinc(M_PI * window_phase) / window_mod;
855          delta = (val - phase_table[phase * stride * taps + j]);
856          phase_table[(phase * stride + 1) * taps + j] = delta;
857       }
858    }
859 }
860
861 static void *resampler_sinc_new(const struct resampler_config *config,
862       double bandwidth_mod, enum resampler_quality quality,
863       resampler_simd_mask_t mask)
864 {
865    double cutoff                  = 0.0;
866    size_t phase_elems             = 0;
867    size_t elems                   = 0;
868    unsigned enable_avx            = 0;
869    unsigned sidelobes             = 0;
870    enum sinc_window window_type   = SINC_WINDOW_NONE;
871    rarch_sinc_resampler_t *re     = (rarch_sinc_resampler_t*)
872       calloc(1, sizeof(*re));
873
874    if (!re)
875       return NULL;
876
877    switch (quality)
878    {
879       case RESAMPLER_QUALITY_LOWEST:
880          cutoff            = 0.98;
881          sidelobes         = 2;
882          re->phase_bits    = 12;
883          re->subphase_bits = 10;
884          window_type       = SINC_WINDOW_LANCZOS;
885          break;
886       case RESAMPLER_QUALITY_LOWER:
887          cutoff            = 0.98;
888          sidelobes         = 4;
889          re->phase_bits    = 12;
890          re->subphase_bits = 10;
891          window_type       = SINC_WINDOW_LANCZOS;
892          break;
893       case RESAMPLER_QUALITY_HIGHER:
894          cutoff            = 0.90;
895          sidelobes         = 32;
896          re->phase_bits    = 10;
897          re->subphase_bits = 14;
898          window_type       = SINC_WINDOW_KAISER;
899          re->kaiser_beta   = 10.5;
900          enable_avx        = 1;
901          break;
902       case RESAMPLER_QUALITY_HIGHEST:
903          cutoff            = 0.962;
904          sidelobes         = 128;
905          re->phase_bits    = 10;
906          re->subphase_bits = 14;
907          window_type       = SINC_WINDOW_KAISER;
908          re->kaiser_beta   = 14.5;
909          enable_avx        = 1;
910          break;
911       case RESAMPLER_QUALITY_NORMAL:
912       case RESAMPLER_QUALITY_DONTCARE:
913          cutoff            = 0.825;
914          sidelobes         = 8;
915          re->phase_bits    = 8;
916          re->subphase_bits = 16;
917          window_type       = SINC_WINDOW_KAISER;
918          re->kaiser_beta   = 5.5;
919          break;
920    }
921
922    re->subphase_mask = (1 << re->subphase_bits) - 1;
923    re->subphase_mod  = 1.0f / (1 << re->subphase_bits);
924    re->taps          = sidelobes * 2;
925
926    /* Downsampling, must lower cutoff, and extend number of
927     * taps accordingly to keep same stopband attenuation. */
928    if (bandwidth_mod < 1.0)
929    {
930       cutoff  *= bandwidth_mod;
931       re->taps = (unsigned)ceil(re->taps / bandwidth_mod);
932    }
933
934    /* Be SIMD-friendly. */
935 #if defined(__AVX__)
936    if (enable_avx)
937       re->taps  = (re->taps + 7) & ~7;
938    else
939 #endif
940    {
941 #if (defined(__ARM_NEON__) || defined(HAVE_NEON))
942       re->taps     = (re->taps + 7) & ~7;
943 #else
944       re->taps     = (re->taps + 3) & ~3;
945 #endif
946    }
947
948    phase_elems     = ((1 << re->phase_bits) * re->taps);
949    if (window_type == SINC_WINDOW_KAISER)
950       phase_elems  = phase_elems * 2;
951    elems           = phase_elems + 4 * re->taps;
952
953    re->main_buffer = (float*)memalign_alloc(128, sizeof(float) * elems);
954    if (!re->main_buffer)
955       goto error;
956
957    memset(re->main_buffer, 0, sizeof(float) * elems);
958
959    re->phase_table = re->main_buffer;
960    re->buffer_l    = re->main_buffer + phase_elems;
961    re->buffer_r    = re->buffer_l + 2 * re->taps;
962
963    switch (window_type)
964    {
965       case SINC_WINDOW_LANCZOS:
966          sinc_init_table_lanczos(re, cutoff, re->phase_table,
967                1 << re->phase_bits, re->taps, false);
968          break;
969       case SINC_WINDOW_KAISER:
970          sinc_init_table_kaiser(re, cutoff, re->phase_table,
971                1 << re->phase_bits, re->taps, true);
972          break;
973       case SINC_WINDOW_NONE:
974          goto error;
975    }
976
977    sinc_resampler.process = resampler_sinc_process_c;
978    if (window_type == SINC_WINDOW_KAISER)
979       sinc_resampler.process    = resampler_sinc_process_c_kaiser;
980
981    if (mask & RESAMPLER_SIMD_AVX && enable_avx)
982    {
983 #if defined(__AVX__)
984       sinc_resampler.process    = resampler_sinc_process_avx;
985       if (window_type == SINC_WINDOW_KAISER)
986          sinc_resampler.process = resampler_sinc_process_avx_kaiser;
987 #endif
988    }
989    else if (mask & RESAMPLER_SIMD_SSE)
990    {
991 #if defined(__SSE__)
992       sinc_resampler.process = resampler_sinc_process_sse;
993       if (window_type == SINC_WINDOW_KAISER)
994          sinc_resampler.process = resampler_sinc_process_sse_kaiser;
995 #endif
996    }
997    else if (mask & RESAMPLER_SIMD_NEON)
998    {
999 #if (defined(__ARM_NEON__) || defined(HAVE_NEON))
1000 #ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
1001       if (window_type != SINC_WINDOW_KAISER)
1002          sinc_resampler.process = resampler_sinc_process_neon;
1003 #else
1004       sinc_resampler.process = resampler_sinc_process_neon;
1005       if (window_type == SINC_WINDOW_KAISER)
1006          sinc_resampler.process = resampler_sinc_process_neon_kaiser;
1007 #endif
1008 #endif
1009    }
1010
1011    return re;
1012
1013 error:
1014    resampler_sinc_free(re);
1015    return NULL;
1016 }
1017
1018 retro_resampler_t sinc_resampler = {
1019    resampler_sinc_new,
1020    resampler_sinc_process_c,
1021    resampler_sinc_free,
1022    RESAMPLER_API_VERSION,
1023    "sinc",
1024    "sinc"
1025 };