1 /* Copyright (C) 2010-2020 The RetroArch team
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (sinc_resampler.c).
5 * ---------------------------------------------------------------------------------------
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 /* Bog-standard windowed SINC implementation. */
30 #include <retro_environment.h>
31 #include <retro_inline.h>
35 #include <audio/audio_resampler.h>
39 #include <xmmintrin.h>
43 #include <immintrin.h>
46 /* Rough SNR values for upsampling:
54 /* TODO, make all this more configurable. */
63 /* For the little amount of taps we're using,
64 * SSE1 is faster than AVX for some reason.
65 * AVX code is kept here though as by increasing number
66 * of sinc taps, the AVX code is clearly faster than SSE1.
/* State for the windowed-sinc resampler.
 * NOTE(review): this excerpt elides several members that the process
 * functions below clearly reference (phase_table, buffer_l, buffer_r,
 * taps, ptr, time, phase_bits, subphase_mod, kaiser_beta, main_buffer)
 * — confirm against the full file. */
69 typedef struct rarch_sinc_resampler
71 /* A buffer for phase_table, buffer_l and buffer_r
72 * are created in a single calloc().
73 * Ensure that we get as good cache locality as we can hope for. */
/* Number of fractional bits below phase_bits; used to linearly
 * interpolate between adjacent filter phases. */
79 unsigned subphase_bits;
/* Extracts the sub-phase portion of 'time': (1 << subphase_bits) - 1
 * (set in resampler_sinc_new). */
80 unsigned subphase_mask;
86 } rarch_sinc_resampler_t;
88 #if (defined(__ARM_NEON__) || defined(HAVE_NEON))
90 #ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
91 void process_sinc_neon_asm(float *out, const float *left,
92 const float *right, const float *coeff, unsigned taps);
96 /* Assumes that taps >= 8, and that taps is a multiple of 8.
97 * Not bothering to reimplement this one for the external .S
/* NEON path for the Kaiser-windowed filter.
 * Unlike the plain NEON path, each phase carries a second table of
 * per-tap deltas; coefficients are linearly interpolated between
 * adjacent phases using the fractional (sub-phase) part of 'time'.
 * Assumes taps >= 8 and a multiple of 8 (see comment above).
 * NOTE(review): this excerpt elides braces, the declarations of
 * 'i'/'p3'/'p4', and the output/frame bookkeeping — verify against
 * the full file. */
99 static void resampler_sinc_process_neon_kaiser(void *re_, struct resampler_data *data)
101 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
/* Total phase count; 'time' is a fixed-point position in [0, phases). */
102 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
/* Fixed-point step per output frame. */
103 uint32_t ratio = phases / data->ratio;
104 const float *input = data->data_in;
105 float *output = data->data_out;
106 size_t frames = data->input_frames;
107 size_t out_frames = 0;
108 unsigned taps = resamp->taps;
/* Feed input frames into the ring buffers until 'time' drops below
 * one full phase period. */
111 while (frames && resamp->time >= phases)
113 /* Push in reverse to make filter more obvious. */
/* Each sample is written twice (at ptr and ptr + taps) so a plain
 * contiguous read of 'taps' floats starting at ptr always works. */
118 resamp->buffer_l[resamp->ptr + taps] =
119 resamp->buffer_l[resamp->ptr] = *input++;
121 resamp->buffer_r[resamp->ptr + taps] =
122 resamp->buffer_r[resamp->ptr] = *input++;
124 resamp->time -= phases;
129 const float *buffer_l = resamp->buffer_l + resamp->ptr;
130 const float *buffer_r = resamp->buffer_r + resamp->ptr;
/* Produce output frames while the current position stays inside the
 * buffered window. */
131 while (resamp->time < phases)
133 unsigned phase = resamp->time >> resamp->subphase_bits;
/* Kaiser tables are stored as [coeffs | deltas] per phase, hence
 * the stride of taps * 2. */
134 const float *phase_table = resamp->phase_table + phase * taps * 2;
135 const float *delta_table = phase_table + taps;
/* Fractional distance to the next phase, broadcast to all lanes. */
136 float32x4_t delta = vdupq_n_f32((resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
138 float32x4_t p1 = {0, 0, 0, 0}, p2 = {0, 0, 0, 0};
/* 8 taps per iteration via two interleaved 4-lane vectors. */
141 for (i = 0; i < (int)taps; i += 8)
143 float32x4x2_t coeff8 = vld2q_f32(&phase_table[i]);
144 float32x4x2_t delta8 = vld2q_f32(&delta_table[i]);
145 float32x4x2_t left8 = vld2q_f32(&buffer_l[i]);
146 float32x4x2_t right8 = vld2q_f32(&buffer_r[i]);
/* coeff += delta_table * delta : interpolate toward next phase. */
148 coeff8.val[0] = vmlaq_f32(coeff8.val[0], delta8.val[0], delta);
149 coeff8.val[1] = vmlaq_f32(coeff8.val[1], delta8.val[1], delta);
151 p1 = vmlaq_f32(p1, left8.val[0], coeff8.val[0]);
152 p2 = vmlaq_f32(p2, right8.val[0], coeff8.val[0]);
153 p1 = vmlaq_f32(p1, left8.val[1], coeff8.val[1]);
154 p2 = vmlaq_f32(p2, right8.val[1], coeff8.val[1]);
/* Horizontal reduction: fold each accumulator to 2 lanes, then
 * pairwise-add to get {L, R} and store both channels at once. */
157 p3 = vadd_f32(vget_low_f32(p1), vget_high_f32(p1));
158 p4 = vadd_f32(vget_low_f32(p2), vget_high_f32(p2));
159 vst1_f32(output, vpadd_f32(p3, p4));
162 resamp->time += ratio;
167 data->output_frames = out_frames;
171 /* Assumes that taps >= 8, and that taps is a multiple of 8. */
/* NEON path for the non-Kaiser (Lanczos) filter: coefficients are read
 * directly from the phase table with no sub-phase interpolation, so the
 * table stride is 'taps' (not taps * 2).  When the hand-written asm is
 * available, the inner kernel is delegated to process_sinc_neon_asm.
 * Assumes taps >= 8 and a multiple of 8.
 * NOTE(review): this excerpt elides braces, '#else'/'#endif', the
 * declarations of 'i'/'p3'/'p4', and the output/frame bookkeeping —
 * verify against the full file. */
172 static void resampler_sinc_process_neon(void *re_, struct resampler_data *data)
174 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
175 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
/* Fixed-point step per output frame. */
176 uint32_t ratio = phases / data->ratio;
177 const float *input = data->data_in;
178 float *output = data->data_out;
179 size_t frames = data->input_frames;
180 size_t out_frames = 0;
181 unsigned taps = resamp->taps;
/* Fill the ring buffers with new input frames. */
185 while (frames && resamp->time >= phases)
187 /* Push in reverse to make filter more obvious. */
/* Double-write so a contiguous taps-length read from ptr is valid. */
192 resamp->buffer_l[resamp->ptr + taps] =
193 resamp->buffer_l[resamp->ptr] = *input++;
195 resamp->buffer_r[resamp->ptr + taps] =
196 resamp->buffer_r[resamp->ptr] = *input++;
198 resamp->time -= phases;
203 const float *buffer_l = resamp->buffer_l + resamp->ptr;
204 const float *buffer_r = resamp->buffer_r + resamp->ptr;
205 while (resamp->time < phases)
207 unsigned phase = resamp->time >> resamp->subphase_bits;
/* Non-Kaiser table: one coefficient row per phase. */
208 const float *phase_table = resamp->phase_table + phase * taps;
209 #ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
/* External .S kernel computes both channels for this frame. */
210 process_sinc_neon_asm(output, buffer_l, buffer_r, phase_table, taps);
213 float32x4_t p1 = {0, 0, 0, 0}, p2 = {0, 0, 0, 0};
/* Intrinsics fallback: 8 taps per iteration. */
216 for (i = 0; i < (int)taps; i += 8)
218 float32x4x2_t coeff8 = vld2q_f32(&phase_table[i]);
219 float32x4x2_t left8 = vld2q_f32(&buffer_l[i]);
220 float32x4x2_t right8 = vld2q_f32(&buffer_r[i]);
222 p1 = vmlaq_f32(p1, left8.val[0], coeff8.val[0]);
223 p2 = vmlaq_f32(p2, right8.val[0], coeff8.val[0]);
224 p1 = vmlaq_f32(p1, left8.val[1], coeff8.val[1]);
225 p2 = vmlaq_f32(p2, right8.val[1], coeff8.val[1]);
/* Horizontal reduction to {L, R}, stored as one interleaved pair. */
228 p3 = vadd_f32(vget_low_f32(p1), vget_high_f32(p1));
229 p4 = vadd_f32(vget_low_f32(p2), vget_high_f32(p2));
230 vst1_f32(output, vpadd_f32(p3, p4));
234 resamp->time += ratio;
239 data->output_frames = out_frames;
/* AVX path for the Kaiser-windowed filter: 8 taps per iteration with
 * per-frame linear interpolation between adjacent phase tables.
 * NOTE(review): this excerpt elides braces, the declaration of 'i',
 * and the output/frame bookkeeping — verify against the full file. */
244 static void resampler_sinc_process_avx_kaiser(void *re_, struct resampler_data *data)
246 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
247 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
/* Fixed-point step per output frame. */
249 uint32_t ratio = phases / data->ratio;
250 const float *input = data->data_in;
251 float *output = data->data_out;
252 size_t frames = data->input_frames;
253 size_t out_frames = 0;
254 unsigned taps = resamp->taps;
/* Fill the ring buffers with new input frames. */
259 while (frames && resamp->time >= phases)
261 /* Push in reverse to make filter more obvious. */
/* Double-write so a contiguous taps-length read from ptr is valid. */
266 resamp->buffer_l[resamp->ptr + taps] =
267 resamp->buffer_l[resamp->ptr] = *input++;
269 resamp->buffer_r[resamp->ptr + taps] =
270 resamp->buffer_r[resamp->ptr] = *input++;
272 resamp->time -= phases;
277 const float *buffer_l = resamp->buffer_l + resamp->ptr;
278 const float *buffer_r = resamp->buffer_r + resamp->ptr;
279 while (resamp->time < phases)
282 unsigned phase = resamp->time >> resamp->subphase_bits;
/* Kaiser tables are [coeffs | deltas] per phase: stride taps * 2. */
284 float *phase_table = resamp->phase_table + phase * taps * 2;
285 float *delta_table = phase_table + taps;
/* Fractional distance to the next phase, broadcast to all 8 lanes. */
286 __m256 delta = _mm256_set1_ps((float)
287 (resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
289 __m256 sum_l = _mm256_setzero_ps();
290 __m256 sum_r = _mm256_setzero_ps();
292 for (i = 0; i < (int)taps; i += 8)
/* Ring-buffer reads are only 4-byte aligned (ptr moves by one
 * float), hence loadu; the tables come from a 128-byte-aligned
 * allocation, hence aligned loads. */
294 __m256 buf_l = _mm256_loadu_ps(buffer_l + i);
295 __m256 buf_r = _mm256_loadu_ps(buffer_r + i);
296 __m256 deltas = _mm256_load_ps(delta_table + i);
/* sinc = coeff + delta_table * delta : interpolated coefficient. */
297 __m256 sinc = _mm256_add_ps(_mm256_load_ps((const float*)phase_table + i),
298 _mm256_mul_ps(deltas, delta));
300 sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
301 sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
304 /* hadd on AVX is weird, and acts on low-lanes
305 * and high-lanes separately. */
306 __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
307 __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
308 res_l = _mm256_hadd_ps(res_l, res_l);
309 res_r = _mm256_hadd_ps(res_r, res_r);
/* Fold the two 128-bit lanes together to finish the reduction. */
310 res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
311 res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);
313 /* This is optimized to mov %xmmN, [mem].
314 * There doesn't seem to be any _mm256_store_ss intrinsic. */
315 _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0));
316 _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0));
320 resamp->time += ratio;
326 data->output_frames = out_frames;
/* AVX path for the non-Kaiser (Lanczos) filter: same structure as the
 * Kaiser variant above but with no delta table / interpolation, so the
 * phase-table stride is just 'taps'.
 * NOTE(review): this excerpt elides braces, the declaration of 'i',
 * and the output/frame bookkeeping — verify against the full file. */
329 static void resampler_sinc_process_avx(void *re_, struct resampler_data *data)
331 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
332 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
/* Fixed-point step per output frame. */
334 uint32_t ratio = phases / data->ratio;
335 const float *input = data->data_in;
336 float *output = data->data_out;
337 size_t frames = data->input_frames;
338 size_t out_frames = 0;
339 unsigned taps = resamp->taps;
/* Fill the ring buffers with new input frames. */
344 while (frames && resamp->time >= phases)
346 /* Push in reverse to make filter more obvious. */
/* Double-write so a contiguous taps-length read from ptr is valid. */
351 resamp->buffer_l[resamp->ptr + taps] =
352 resamp->buffer_l[resamp->ptr] = *input++;
354 resamp->buffer_r[resamp->ptr + taps] =
355 resamp->buffer_r[resamp->ptr] = *input++;
357 resamp->time -= phases;
362 const float *buffer_l = resamp->buffer_l + resamp->ptr;
363 const float *buffer_r = resamp->buffer_r + resamp->ptr;
364 while (resamp->time < phases)
368 unsigned phase = resamp->time >> resamp->subphase_bits;
/* One coefficient row per phase; no interpolation. */
369 float *phase_table = resamp->phase_table + phase * taps;
371 __m256 sum_l = _mm256_setzero_ps();
372 __m256 sum_r = _mm256_setzero_ps();
374 for (i = 0; i < (int)taps; i += 8)
/* Unaligned buffer loads; aligned table load (128-byte alloc). */
376 __m256 buf_l = _mm256_loadu_ps(buffer_l + i);
377 __m256 buf_r = _mm256_loadu_ps(buffer_r + i);
378 __m256 sinc = _mm256_load_ps((const float*)phase_table + i);
380 sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
381 sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
384 /* hadd on AVX is weird, and acts on low-lanes
385 * and high-lanes separately. */
386 __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
387 __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
388 res_l = _mm256_hadd_ps(res_l, res_l);
389 res_r = _mm256_hadd_ps(res_r, res_r);
/* Fold the two 128-bit lanes together to finish the reduction. */
390 res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
391 res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);
393 /* This is optimized to mov %xmmN, [mem].
394 * There doesn't seem to be any _mm256_store_ss intrinsic. */
395 _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0));
396 _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0));
400 resamp->time += ratio;
406 data->output_frames = out_frames;
/* SSE1 path for the Kaiser-windowed filter: 4 taps per iteration with
 * per-frame linear interpolation between adjacent phase tables.
 * NOTE(review): this excerpt elides braces, the declarations of
 * 'i'/'sum', and the output/frame bookkeeping — verify against the
 * full file. */
411 static void resampler_sinc_process_sse_kaiser(void *re_, struct resampler_data *data)
413 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
414 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
/* Fixed-point step per output frame. */
416 uint32_t ratio = phases / data->ratio;
417 const float *input = data->data_in;
418 float *output = data->data_out;
419 size_t frames = data->input_frames;
420 size_t out_frames = 0;
421 unsigned taps = resamp->taps;
/* Fill the ring buffers with new input frames. */
426 while (frames && resamp->time >= phases)
428 /* Push in reverse to make filter more obvious. */
/* Double-write so a contiguous taps-length read from ptr is valid. */
433 resamp->buffer_l[resamp->ptr + taps] =
434 resamp->buffer_l[resamp->ptr] = *input++;
436 resamp->buffer_r[resamp->ptr + taps] =
437 resamp->buffer_r[resamp->ptr] = *input++;
439 resamp->time -= phases;
444 const float *buffer_l = resamp->buffer_l + resamp->ptr;
445 const float *buffer_r = resamp->buffer_r + resamp->ptr;
446 while (resamp->time < phases)
450 unsigned phase = resamp->time >> resamp->subphase_bits;
/* Kaiser tables are [coeffs | deltas] per phase: stride taps * 2. */
451 float *phase_table = resamp->phase_table + phase * taps * 2;
452 float *delta_table = phase_table + taps;
/* Fractional distance to the next phase, broadcast to all 4 lanes. */
453 __m128 delta = _mm_set1_ps((float)
454 (resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
456 __m128 sum_l = _mm_setzero_ps();
457 __m128 sum_r = _mm_setzero_ps();
459 for (i = 0; i < (int)taps; i += 4)
/* Unaligned buffer loads; aligned table loads (128-byte alloc). */
461 __m128 buf_l = _mm_loadu_ps(buffer_l + i);
462 __m128 buf_r = _mm_loadu_ps(buffer_r + i);
463 __m128 deltas = _mm_load_ps(delta_table + i);
/* _sinc = coeff + delta_table * delta : interpolated coefficient. */
464 __m128 _sinc = _mm_add_ps(_mm_load_ps((const float*)phase_table + i),
465 _mm_mul_ps(deltas, delta));
466 sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc));
467 sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc));
470 /* Them annoying shuffles.
471 * sum_l = { l3, l2, l1, l0 }
472 * sum_r = { r3, r2, r1, r0 }
475 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r,
476 _MM_SHUFFLE(1, 0, 1, 0)),
477 _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));
479 /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
480 * sum = { R1, R0, L1, L0 }
483 sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);
485 /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
486 * sum = { X, R, X, L }
/* Store left channel from the low lane... */
490 _mm_store_ss(output + 0, sum);
492 /* movehl { X, R, X, L } == { X, R, X, R } */
/* ...and right channel after moving the high pair down. */
493 _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum));
497 resamp->time += ratio;
503 data->output_frames = out_frames;
/* SSE1 path for the non-Kaiser (Lanczos) filter: same reduction scheme
 * as the Kaiser variant above, but coefficients come straight from the
 * phase table (stride 'taps', no interpolation).
 * NOTE(review): this excerpt elides braces, the declarations of
 * 'i'/'sum', and the output/frame bookkeeping — verify against the
 * full file. */
506 static void resampler_sinc_process_sse(void *re_, struct resampler_data *data)
508 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
509 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
/* Fixed-point step per output frame. */
511 uint32_t ratio = phases / data->ratio;
512 const float *input = data->data_in;
513 float *output = data->data_out;
514 size_t frames = data->input_frames;
515 size_t out_frames = 0;
516 unsigned taps = resamp->taps;
/* Fill the ring buffers with new input frames. */
521 while (frames && resamp->time >= phases)
523 /* Push in reverse to make filter more obvious. */
/* Double-write so a contiguous taps-length read from ptr is valid. */
528 resamp->buffer_l[resamp->ptr + taps] =
529 resamp->buffer_l[resamp->ptr] = *input++;
531 resamp->buffer_r[resamp->ptr + taps] =
532 resamp->buffer_r[resamp->ptr] = *input++;
534 resamp->time -= phases;
539 const float *buffer_l = resamp->buffer_l + resamp->ptr;
540 const float *buffer_r = resamp->buffer_r + resamp->ptr;
541 while (resamp->time < phases)
545 unsigned phase = resamp->time >> resamp->subphase_bits;
/* One coefficient row per phase; no interpolation. */
546 float *phase_table = resamp->phase_table + phase * taps;
548 __m128 sum_l = _mm_setzero_ps();
549 __m128 sum_r = _mm_setzero_ps();
551 for (i = 0; i < (int)taps; i += 4)
553 __m128 buf_l = _mm_loadu_ps(buffer_l + i);
554 __m128 buf_r = _mm_loadu_ps(buffer_r + i);
555 __m128 _sinc = _mm_load_ps((const float*)phase_table + i);
556 sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc));
557 sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc));
560 /* Them annoying shuffles.
561 * sum_l = { l3, l2, l1, l0 }
562 * sum_r = { r3, r2, r1, r0 }
565 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r,
566 _MM_SHUFFLE(1, 0, 1, 0)),
567 _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));
569 /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
570 * sum = { R1, R0, L1, L0 }
573 sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);
575 /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
576 * sum = { X, R, X, L }
/* Store left channel from the low lane... */
580 _mm_store_ss(output + 0, sum);
582 /* movehl { X, R, X, L } == { X, R, X, R } */
/* ...and right channel after moving the high pair down. */
583 _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum));
587 resamp->time += ratio;
593 data->output_frames = out_frames;
/* Portable C fallback for the Kaiser-windowed filter: scalar
 * multiply-accumulate over all taps with per-frame interpolation
 * between adjacent phase tables.
 * NOTE(review): this excerpt elides braces, the declarations of
 * 'i'/'sum_l'/'sum_r', and the output/frame bookkeeping — verify
 * against the full file. */
597 static void resampler_sinc_process_c_kaiser(void *re_, struct resampler_data *data)
599 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
600 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
/* Fixed-point step per output frame. */
602 uint32_t ratio = phases / data->ratio;
603 const float *input = data->data_in;
604 float *output = data->data_out;
605 size_t frames = data->input_frames;
606 size_t out_frames = 0;
607 unsigned taps = resamp->taps;
/* Fill the ring buffers with new input frames. */
612 while (frames && resamp->time >= phases)
614 /* Push in reverse to make filter more obvious. */
/* Double-write so a contiguous taps-length read from ptr is valid. */
619 resamp->buffer_l[resamp->ptr + taps] =
620 resamp->buffer_l[resamp->ptr] = *input++;
622 resamp->buffer_r[resamp->ptr + taps] =
623 resamp->buffer_r[resamp->ptr] = *input++;
625 resamp->time -= phases;
630 const float *buffer_l = resamp->buffer_l + resamp->ptr;
631 const float *buffer_r = resamp->buffer_r + resamp->ptr;
632 while (resamp->time < phases)
637 unsigned phase = resamp->time >> resamp->subphase_bits;
/* Kaiser tables are [coeffs | deltas] per phase: stride taps * 2. */
638 float *phase_table = resamp->phase_table + phase * taps * 2;
639 float *delta_table = phase_table + taps;
/* Fractional distance to the next phase in [0, 1). */
640 float delta = (float)
641 (resamp->time & resamp->subphase_mask) * resamp->subphase_mod;
643 for (i = 0; i < (int)taps; i++)
/* Interpolated coefficient for this tap. */
645 float sinc_val = phase_table[i] + delta_table[i] * delta;
647 sum_l += buffer_l[i] * sinc_val;
648 sum_r += buffer_r[i] * sinc_val;
656 resamp->time += ratio;
663 data->output_frames = out_frames;
/* Portable C fallback for the non-Kaiser (Lanczos) filter: scalar
 * multiply-accumulate over all taps, coefficients read directly from
 * the phase table (no interpolation).
 * NOTE(review): this excerpt elides braces, the declarations of
 * 'i'/'sum_l'/'sum_r', and the output/frame bookkeeping — verify
 * against the full file. */
666 static void resampler_sinc_process_c(void *re_, struct resampler_data *data)
668 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
669 unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
/* Fixed-point step per output frame. */
671 uint32_t ratio = phases / data->ratio;
672 const float *input = data->data_in;
673 float *output = data->data_out;
674 size_t frames = data->input_frames;
675 size_t out_frames = 0;
676 unsigned taps = resamp->taps;
/* Fill the ring buffers with new input frames. */
681 while (frames && resamp->time >= phases)
683 /* Push in reverse to make filter more obvious. */
/* Double-write so a contiguous taps-length read from ptr is valid. */
688 resamp->buffer_l[resamp->ptr + taps] =
689 resamp->buffer_l[resamp->ptr] = *input++;
691 resamp->buffer_r[resamp->ptr + taps] =
692 resamp->buffer_r[resamp->ptr] = *input++;
694 resamp->time -= phases;
699 const float *buffer_l = resamp->buffer_l + resamp->ptr;
700 const float *buffer_r = resamp->buffer_r + resamp->ptr;
701 while (resamp->time < phases)
706 unsigned phase = resamp->time >> resamp->subphase_bits;
/* One coefficient row per phase. */
707 float *phase_table = resamp->phase_table + phase * taps;
709 for (i = 0; i < (int)taps; i++)
711 float sinc_val = phase_table[i];
713 sum_l += buffer_l[i] * sinc_val;
714 sum_r += buffer_r[i] * sinc_val;
722 resamp->time += ratio;
729 data->output_frames = out_frames;
/* Releases a resampler created by resampler_sinc_new.  The phase table
 * and both ring buffers live in the single 'main_buffer' allocation, so
 * one memalign_free covers them all.
 * NOTE(review): this excerpt elides the surrounding lines — presumably
 * a NULL guard on 'resamp' and the free() of the struct itself; confirm
 * against the full file (as written here, a NULL 'data' would be
 * dereferenced). */
732 static void resampler_sinc_free(void *data)
734 rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)data;
736 memalign_free(resamp->main_buffer);
/* Builds the polyphase coefficient tables for the Kaiser-windowed sinc.
 * When calculate_delta is true the layout per phase is
 * [taps coefficients | taps deltas], where each delta is the difference
 * to the same tap in the next phase (used for sub-phase interpolation
 * at run time); otherwise phases are packed back to back.
 * NOTE(review): this excerpt elides braces, the declarations of
 * 'i'/'j'/'p'/'phase'/'val'/'sinc_phase'/'delta', and the 'cutoff'
 * parameter presumably declared on an elided signature line — verify
 * against the full file. */
740 static void sinc_init_table_kaiser(rarch_sinc_resampler_t *resamp,
742 float *phase_table, int phases, int taps, bool calculate_delta)
745 /* Kaiser window function - need to normalize w(0) to 1.0f */
/* besseli0(beta) == window value at the center; divide it out so the
 * window peaks at 1.0. */
746 float kaiser_beta = resamp->kaiser_beta;
747 double window_mod = besseli0(kaiser_beta);
/* Stride 2 leaves room for the interleaved delta rows. */
748 int stride = calculate_delta ? 2 : 1;
749 double sidelobes = taps / 2.0;
/* First pass: raw windowed-sinc coefficients for every phase. */
751 for (i = 0; i < phases; i++)
753 for (j = 0; j < taps; j++)
757 int n = j * phases + i;
758 double window_phase = (double)n / (phases * taps); /* [0, 1). */
759 window_phase = 2.0 * window_phase - 1.0; /* [-1, 1) */
760 sinc_phase = sidelobes * window_phase;
/* NOTE(review): sqrtf (float) on a double expression loses
 * precision vs sqrt; also the '/ window_mod' normalization used
 * in the second pass below appears to be elided from this line's
 * continuation — confirm against the full file. */
761 val = cutoff * sinc(M_PI * sinc_phase * cutoff) *
762 besseli0(kaiser_beta * sqrtf(1 - window_phase * window_phase))
764 phase_table[i * stride * taps + j] = val;
/* Second pass (calculate_delta): deltas between adjacent phases. */
773 for (p = 0; p < phases - 1; p++)
775 for (j = 0; j < taps; j++)
777 float delta = phase_table[(p + 1) * stride * taps + j] -
778 phase_table[p * stride * taps + j];
779 phase_table[(p * stride + 1) * taps + j] = delta;
/* Last phase has no successor in the table; compute its "next"
 * coefficients directly from the window formula instead. */
784 for (j = 0; j < taps; j++)
788 int n = j * phases + (phase + 1);
789 double window_phase = (double)n / (phases * taps); /* (0, 1]. */
790 window_phase = 2.0 * window_phase - 1.0; /* (-1, 1] */
791 sinc_phase = sidelobes * window_phase;
793 val = cutoff * sinc(M_PI * sinc_phase * cutoff) *
794 besseli0(resamp->kaiser_beta * sqrtf(1 - window_phase *
795 window_phase)) / window_mod;
796 delta = (val - phase_table[phase * stride * taps + j]);
797 phase_table[(phase * stride + 1) * taps + j] = delta;
/* Builds the polyphase coefficient tables for the Lanczos-windowed
 * sinc: sinc(x) windowed by sinc(x / sidelobes).  Same table layout and
 * optional delta rows as sinc_init_table_kaiser.
 * NOTE(review): this excerpt elides braces and the declarations of
 * 'i'/'j'/'p'/'phase'/'val'/'sinc_phase'/'delta' — verify against the
 * full file. */
802 static void sinc_init_table_lanczos(
803 rarch_sinc_resampler_t *resamp, double cutoff,
804 float *phase_table, int phases, int taps, bool calculate_delta)
807 /* Lanczos window function - need to normalize w(0) to 1.0f */
/* sinc(0) is already 1.0, so the normalization divisor is trivially 1
 * (kept for symmetry with the Kaiser initializer). */
808 double window_mod = 1.0;
/* Stride 2 leaves room for the interleaved delta rows. */
809 int stride = calculate_delta ? 2 : 1;
810 double sidelobes = taps / 2.0;
/* First pass: raw windowed-sinc coefficients for every phase. */
812 for (i = 0; i < phases; i++)
814 for (j = 0; j < taps; j++)
818 int n = j * phases + i;
819 double window_phase = (double)n / (phases * taps); /* [0, 1). */
820 window_phase = 2.0 * window_phase - 1.0; /* [-1, 1) */
821 sinc_phase = sidelobes * window_phase;
822 val = cutoff * sinc(M_PI * sinc_phase * cutoff) *
823 sinc(M_PI * window_phase) / window_mod;
824 phase_table[i * stride * taps + j] = val;
/* Second pass (calculate_delta): deltas between adjacent phases. */
833 for (p = 0; p < phases - 1; p++)
835 for (j = 0; j < taps; j++)
837 float delta = phase_table[(p + 1) * stride * taps + j] -
838 phase_table[p * stride * taps + j];
839 phase_table[(p * stride + 1) * taps + j] = delta;
/* Last phase: compute its "next" coefficients from the formula. */
844 for (j = 0; j < taps; j++)
848 int n = j * phases + (phase + 1);
849 double window_phase = (double)n / (phases * taps); /* (0, 1]. */
850 window_phase = 2.0 * window_phase - 1.0; /* (-1, 1] */
851 sinc_phase = sidelobes * window_phase;
853 val = cutoff * sinc(M_PI * sinc_phase * cutoff) *
854 sinc(M_PI * window_phase) / window_mod;
855 delta = (val - phase_table[phase * stride * taps + j]);
856 phase_table[(phase * stride + 1) * taps + j] = delta;
/* Allocates and initializes a sinc resampler.
 * Picks window type, tap count, and phase/subphase resolution from the
 * requested quality, builds the coefficient tables, then selects the
 * fastest process function permitted by the SIMD capability mask.
 * Returns the new resampler, or NULL on allocation failure.
 * NOTE(review): this excerpt elides many lines — the calloc NULL check,
 * 'cutoff'/'elems' declarations, per-quality 'cutoff'/'sidelobes'/
 * 'phase_bits'/'enable_avx' assignments, 'break' statements, the
 * switch/#if/#else/#endif scaffolding, and the error/return paths —
 * verify against the full file. */
861 static void *resampler_sinc_new(const struct resampler_config *config,
862 double bandwidth_mod, enum resampler_quality quality,
863 resampler_simd_mask_t mask)
866 size_t phase_elems = 0;
868 unsigned enable_avx = 0;
869 unsigned sidelobes = 0;
870 enum sinc_window window_type = SINC_WINDOW_NONE;
871 rarch_sinc_resampler_t *re = (rarch_sinc_resampler_t*)
872 calloc(1, sizeof(*re));
/* Quality presets: lower quality -> Lanczos window, fewer subphase
 * bits; higher quality -> Kaiser window with increasing beta. */
879 case RESAMPLER_QUALITY_LOWEST:
883 re->subphase_bits = 10;
884 window_type = SINC_WINDOW_LANCZOS;
886 case RESAMPLER_QUALITY_LOWER:
890 re->subphase_bits = 10;
891 window_type = SINC_WINDOW_LANCZOS;
893 case RESAMPLER_QUALITY_HIGHER:
897 re->subphase_bits = 14;
898 window_type = SINC_WINDOW_KAISER;
899 re->kaiser_beta = 10.5;
902 case RESAMPLER_QUALITY_HIGHEST:
906 re->subphase_bits = 14;
907 window_type = SINC_WINDOW_KAISER;
908 re->kaiser_beta = 14.5;
911 case RESAMPLER_QUALITY_NORMAL:
912 case RESAMPLER_QUALITY_DONTCARE:
916 re->subphase_bits = 16;
917 window_type = SINC_WINDOW_KAISER;
918 re->kaiser_beta = 5.5;
/* Derived fixed-point helpers for the process loops. */
922 re->subphase_mask = (1 << re->subphase_bits) - 1;
923 re->subphase_mod = 1.0f / (1 << re->subphase_bits);
924 re->taps = sidelobes * 2;
926 /* Downsampling, must lower cutoff, and extend number of
927 * taps accordingly to keep same stopband attenuation. */
928 if (bandwidth_mod < 1.0)
930 cutoff *= bandwidth_mod;
931 re->taps = (unsigned)ceil(re->taps / bandwidth_mod);
934 /* Be SIMD-friendly. */
/* Round taps up to a multiple of 8 (AVX/NEON) or 4 (SSE/C);
 * the #if scaffolding choosing between these lines is elided. */
937 re->taps = (re->taps + 7) & ~7;
941 #if (defined(__ARM_NEON__) || defined(HAVE_NEON))
942 re->taps = (re->taps + 7) & ~7;
944 re->taps = (re->taps + 3) & ~3;
/* Kaiser needs twice the table space for the interleaved delta rows;
 * 4 * taps covers the two double-length ring buffers. */
948 phase_elems = ((1 << re->phase_bits) * re->taps);
949 if (window_type == SINC_WINDOW_KAISER)
950 phase_elems = phase_elems * 2;
951 elems = phase_elems + 4 * re->taps;
/* Single 128-byte-aligned allocation: required by the aligned SIMD
 * table loads in the process functions, and gives the cache locality
 * noted on the struct. */
953 re->main_buffer = (float*)memalign_alloc(128, sizeof(float) * elems);
954 if (!re->main_buffer)
957 memset(re->main_buffer, 0, sizeof(float) * elems);
/* Carve the single allocation into table + L/R ring buffers. */
959 re->phase_table = re->main_buffer;
960 re->buffer_l = re->main_buffer + phase_elems;
961 re->buffer_r = re->buffer_l + 2 * re->taps;
/* Build the coefficient tables for the selected window. */
965 case SINC_WINDOW_LANCZOS:
966 sinc_init_table_lanczos(re, cutoff, re->phase_table,
967 1 << re->phase_bits, re->taps, false);
969 case SINC_WINDOW_KAISER:
970 sinc_init_table_kaiser(re, cutoff, re->phase_table,
971 1 << re->phase_bits, re->taps, true);
973 case SINC_WINDOW_NONE:
/* Default to the portable C path, then upgrade by SIMD mask. */
977 sinc_resampler.process = resampler_sinc_process_c;
978 if (window_type == SINC_WINDOW_KAISER)
979 sinc_resampler.process = resampler_sinc_process_c_kaiser;
981 if (mask & RESAMPLER_SIMD_AVX && enable_avx)
984 sinc_resampler.process = resampler_sinc_process_avx;
985 if (window_type == SINC_WINDOW_KAISER)
986 sinc_resampler.process = resampler_sinc_process_avx_kaiser;
989 else if (mask & RESAMPLER_SIMD_SSE)
992 sinc_resampler.process = resampler_sinc_process_sse;
993 if (window_type == SINC_WINDOW_KAISER)
994 sinc_resampler.process = resampler_sinc_process_sse_kaiser;
997 else if (mask & RESAMPLER_SIMD_NEON)
999 #if (defined(__ARM_NEON__) || defined(HAVE_NEON))
1000 #ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
/* NOTE(review): the asm branch only covers the non-Kaiser path; the
 * intrinsics branch below (presumably under an elided #else) handles
 * both — confirm the elided preprocessor structure. */
1001 if (window_type != SINC_WINDOW_KAISER)
1002 sinc_resampler.process = resampler_sinc_process_neon;
1004 sinc_resampler.process = resampler_sinc_process_neon;
1005 if (window_type == SINC_WINDOW_KAISER)
1006 sinc_resampler.process = resampler_sinc_process_neon_kaiser;
/* Error path: release partial state on failure. */
1014 resampler_sinc_free(re);
/* Public driver descriptor for the sinc resampler.
 * 'process' defaults to the portable C path and is overwritten at
 * runtime by resampler_sinc_new according to window type and SIMD mask.
 * NOTE(review): the initializer is truncated in this excerpt — the
 * first field (presumably the init/new callback) and any trailing
 * fields/closing brace are elided; confirm against the full file. */
1018 retro_resampler_t sinc_resampler = {
1020 resampler_sinc_process_c,
1021 resampler_sinc_free,
1022 RESAMPLER_API_VERSION,