Commit | Line | Data |
---|---|---|
3719602c PC |
1 | /* Copyright (C) 2010-2020 The RetroArch team |
2 | * | |
3 | * --------------------------------------------------------------------------------------- | |
4 | * The following license statement only applies to this file (sinc_resampler.c). | |
5 | * --------------------------------------------------------------------------------------- | |
6 | * | |
7 | * Permission is hereby granted, free of charge, | |
8 | * to any person obtaining a copy of this software and associated documentation files (the "Software"), | |
9 | * to deal in the Software without restriction, including without limitation the rights to | |
10 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
11 | * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: | |
12 | * | |
13 | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. | |
14 | * | |
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
16 | * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |
18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
19 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
21 | */ | |
22 | ||
23 | /* Bog-standard windowed SINC implementation. */ | |
24 | ||
25 | #include <stdint.h> | |
26 | #include <stdlib.h> | |
27 | #include <math.h> | |
28 | #include <string.h> | |
29 | ||
30 | #include <retro_environment.h> | |
31 | #include <retro_inline.h> | |
32 | #include <filters.h> | |
33 | #include <memalign.h> | |
34 | ||
35 | #include <audio/audio_resampler.h> | |
36 | #include <filters.h> | |
37 | ||
38 | #ifdef __SSE__ | |
39 | #include <xmmintrin.h> | |
40 | #endif | |
41 | ||
42 | #if defined(__AVX__) | |
43 | #include <immintrin.h> | |
44 | #endif | |
45 | ||
46 | /* Rough SNR values for upsampling: | |
47 | * LOWEST: 40 dB | |
48 | * LOWER: 55 dB | |
49 | * NORMAL: 70 dB | |
50 | * HIGHER: 110 dB | |
51 | * HIGHEST: 140 dB | |
52 | */ | |
53 | ||
54 | /* TODO, make all this more configurable. */ | |
55 | ||
56 | enum sinc_window | |
57 | { | |
58 | SINC_WINDOW_NONE = 0, | |
59 | SINC_WINDOW_KAISER, | |
60 | SINC_WINDOW_LANCZOS | |
61 | }; | |
62 | ||
63 | /* For the little amount of taps we're using, | |
64 | * SSE1 is faster than AVX for some reason. | |
65 | * AVX code is kept here though as by increasing number | |
66 | * of sinc taps, the AVX code is clearly faster than SSE1. | |
67 | */ | |
68 | ||
69 | typedef struct rarch_sinc_resampler | |
70 | { | |
71 | /* A buffer for phase_table, buffer_l and buffer_r | |
72 | * are created in a single calloc(). | |
73 | * Ensure that we get as good cache locality as we can hope for. */ | |
74 | float *main_buffer; | |
75 | float *phase_table; | |
76 | float *buffer_l; | |
77 | float *buffer_r; | |
78 | unsigned phase_bits; | |
79 | unsigned subphase_bits; | |
80 | unsigned subphase_mask; | |
81 | unsigned taps; | |
82 | unsigned ptr; | |
83 | uint32_t time; | |
84 | float subphase_mod; | |
85 | float kaiser_beta; | |
86 | } rarch_sinc_resampler_t; | |
87 | ||
88 | #if (defined(__ARM_NEON__) || defined(HAVE_NEON)) | |
89 | ||
90 | #ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS | |
91 | void process_sinc_neon_asm(float *out, const float *left, | |
92 | const float *right, const float *coeff, unsigned taps); | |
93 | #else | |
94 | #include <arm_neon.h> | |
95 | ||
96 | /* Assumes that taps >= 8, and that taps is a multiple of 8. | |
97 | * Not bothering to reimplement this one for the external .S | |
98 | */ | |
99 | static void resampler_sinc_process_neon_kaiser(void *re_, struct resampler_data *data) | |
100 | { | |
101 | rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; | |
102 | unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits); | |
103 | uint32_t ratio = phases / data->ratio; | |
104 | const float *input = data->data_in; | |
105 | float *output = data->data_out; | |
106 | size_t frames = data->input_frames; | |
107 | size_t out_frames = 0; | |
108 | unsigned taps = resamp->taps; | |
109 | while (frames) | |
110 | { | |
111 | while (frames && resamp->time >= phases) | |
112 | { | |
113 | /* Push in reverse to make filter more obvious. */ | |
114 | if (!resamp->ptr) | |
115 | resamp->ptr = taps; | |
116 | resamp->ptr--; | |
117 | ||
118 | resamp->buffer_l[resamp->ptr + taps] = | |
119 | resamp->buffer_l[resamp->ptr] = *input++; | |
120 | ||
121 | resamp->buffer_r[resamp->ptr + taps] = | |
122 | resamp->buffer_r[resamp->ptr] = *input++; | |
123 | ||
124 | resamp->time -= phases; | |
125 | frames--; | |
126 | } | |
127 | ||
128 | { | |
129 | const float *buffer_l = resamp->buffer_l + resamp->ptr; | |
130 | const float *buffer_r = resamp->buffer_r + resamp->ptr; | |
131 | while (resamp->time < phases) | |
132 | { | |
133 | unsigned phase = resamp->time >> resamp->subphase_bits; | |
134 | const float *phase_table = resamp->phase_table + phase * taps * 2; | |
135 | const float *delta_table = phase_table + taps; | |
136 | float32x4_t delta = vdupq_n_f32((resamp->time & resamp->subphase_mask) * resamp->subphase_mod); | |
137 | int i; | |
138 | float32x4_t p1 = {0, 0, 0, 0}, p2 = {0, 0, 0, 0}; | |
139 | float32x2_t p3, p4; | |
140 | ||
141 | for (i = 0; i < (int)taps; i += 8) | |
142 | { | |
143 | float32x4x2_t coeff8 = vld2q_f32(&phase_table[i]); | |
144 | float32x4x2_t delta8 = vld2q_f32(&delta_table[i]); | |
145 | float32x4x2_t left8 = vld2q_f32(&buffer_l[i]); | |
146 | float32x4x2_t right8 = vld2q_f32(&buffer_r[i]); | |
147 | ||
148 | coeff8.val[0] = vmlaq_f32(coeff8.val[0], delta8.val[0], delta); | |
149 | coeff8.val[1] = vmlaq_f32(coeff8.val[1], delta8.val[1], delta); | |
150 | ||
151 | p1 = vmlaq_f32(p1, left8.val[0], coeff8.val[0]); | |
152 | p2 = vmlaq_f32(p2, right8.val[0], coeff8.val[0]); | |
153 | p1 = vmlaq_f32(p1, left8.val[1], coeff8.val[1]); | |
154 | p2 = vmlaq_f32(p2, right8.val[1], coeff8.val[1]); | |
155 | } | |
156 | ||
157 | p3 = vadd_f32(vget_low_f32(p1), vget_high_f32(p1)); | |
158 | p4 = vadd_f32(vget_low_f32(p2), vget_high_f32(p2)); | |
159 | vst1_f32(output, vpadd_f32(p3, p4)); | |
160 | output += 2; | |
161 | out_frames++; | |
162 | resamp->time += ratio; | |
163 | } | |
164 | } | |
165 | } | |
166 | ||
167 | data->output_frames = out_frames; | |
168 | } | |
169 | #endif | |
170 | ||
171 | /* Assumes that taps >= 8, and that taps is a multiple of 8. */ | |
172 | static void resampler_sinc_process_neon(void *re_, struct resampler_data *data) | |
173 | { | |
174 | rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; | |
175 | unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits); | |
176 | uint32_t ratio = phases / data->ratio; | |
177 | const float *input = data->data_in; | |
178 | float *output = data->data_out; | |
179 | size_t frames = data->input_frames; | |
180 | size_t out_frames = 0; | |
181 | unsigned taps = resamp->taps; | |
182 | ||
183 | while (frames) | |
184 | { | |
185 | while (frames && resamp->time >= phases) | |
186 | { | |
187 | /* Push in reverse to make filter more obvious. */ | |
188 | if (!resamp->ptr) | |
189 | resamp->ptr = taps; | |
190 | resamp->ptr--; | |
191 | ||
192 | resamp->buffer_l[resamp->ptr + taps] = | |
193 | resamp->buffer_l[resamp->ptr] = *input++; | |
194 | ||
195 | resamp->buffer_r[resamp->ptr + taps] = | |
196 | resamp->buffer_r[resamp->ptr] = *input++; | |
197 | ||
198 | resamp->time -= phases; | |
199 | frames--; | |
200 | } | |
201 | ||
202 | { | |
203 | const float *buffer_l = resamp->buffer_l + resamp->ptr; | |
204 | const float *buffer_r = resamp->buffer_r + resamp->ptr; | |
205 | while (resamp->time < phases) | |
206 | { | |
207 | unsigned phase = resamp->time >> resamp->subphase_bits; | |
208 | const float *phase_table = resamp->phase_table + phase * taps; | |
209 | #ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS | |
210 | process_sinc_neon_asm(output, buffer_l, buffer_r, phase_table, taps); | |
211 | #else | |
212 | int i; | |
213 | float32x4_t p1 = {0, 0, 0, 0}, p2 = {0, 0, 0, 0}; | |
214 | float32x2_t p3, p4; | |
215 | ||
216 | for (i = 0; i < (int)taps; i += 8) | |
217 | { | |
218 | float32x4x2_t coeff8 = vld2q_f32(&phase_table[i]); | |
219 | float32x4x2_t left8 = vld2q_f32(&buffer_l[i]); | |
220 | float32x4x2_t right8 = vld2q_f32(&buffer_r[i]); | |
221 | ||
222 | p1 = vmlaq_f32(p1, left8.val[0], coeff8.val[0]); | |
223 | p2 = vmlaq_f32(p2, right8.val[0], coeff8.val[0]); | |
224 | p1 = vmlaq_f32(p1, left8.val[1], coeff8.val[1]); | |
225 | p2 = vmlaq_f32(p2, right8.val[1], coeff8.val[1]); | |
226 | } | |
227 | ||
228 | p3 = vadd_f32(vget_low_f32(p1), vget_high_f32(p1)); | |
229 | p4 = vadd_f32(vget_low_f32(p2), vget_high_f32(p2)); | |
230 | vst1_f32(output, vpadd_f32(p3, p4)); | |
231 | #endif | |
232 | output += 2; | |
233 | out_frames++; | |
234 | resamp->time += ratio; | |
235 | } | |
236 | } | |
237 | } | |
238 | ||
239 | data->output_frames = out_frames; | |
240 | } | |
241 | #endif | |
242 | ||
243 | #if defined(__AVX__) | |
244 | static void resampler_sinc_process_avx_kaiser(void *re_, struct resampler_data *data) | |
245 | { | |
246 | rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; | |
247 | unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits); | |
248 | ||
249 | uint32_t ratio = phases / data->ratio; | |
250 | const float *input = data->data_in; | |
251 | float *output = data->data_out; | |
252 | size_t frames = data->input_frames; | |
253 | size_t out_frames = 0; | |
254 | unsigned taps = resamp->taps; | |
255 | ||
256 | { | |
257 | while (frames) | |
258 | { | |
259 | while (frames && resamp->time >= phases) | |
260 | { | |
261 | /* Push in reverse to make filter more obvious. */ | |
262 | if (!resamp->ptr) | |
263 | resamp->ptr = taps; | |
264 | resamp->ptr--; | |
265 | ||
266 | resamp->buffer_l[resamp->ptr + taps] = | |
267 | resamp->buffer_l[resamp->ptr] = *input++; | |
268 | ||
269 | resamp->buffer_r[resamp->ptr + taps] = | |
270 | resamp->buffer_r[resamp->ptr] = *input++; | |
271 | ||
272 | resamp->time -= phases; | |
273 | frames--; | |
274 | } | |
275 | ||
276 | { | |
277 | const float *buffer_l = resamp->buffer_l + resamp->ptr; | |
278 | const float *buffer_r = resamp->buffer_r + resamp->ptr; | |
279 | while (resamp->time < phases) | |
280 | { | |
281 | int i; | |
282 | unsigned phase = resamp->time >> resamp->subphase_bits; | |
283 | ||
284 | float *phase_table = resamp->phase_table + phase * taps * 2; | |
285 | float *delta_table = phase_table + taps; | |
286 | __m256 delta = _mm256_set1_ps((float) | |
287 | (resamp->time & resamp->subphase_mask) * resamp->subphase_mod); | |
288 | ||
289 | __m256 sum_l = _mm256_setzero_ps(); | |
290 | __m256 sum_r = _mm256_setzero_ps(); | |
291 | ||
292 | for (i = 0; i < (int)taps; i += 8) | |
293 | { | |
294 | __m256 buf_l = _mm256_loadu_ps(buffer_l + i); | |
295 | __m256 buf_r = _mm256_loadu_ps(buffer_r + i); | |
296 | __m256 deltas = _mm256_load_ps(delta_table + i); | |
297 | __m256 sinc = _mm256_add_ps(_mm256_load_ps((const float*)phase_table + i), | |
298 | _mm256_mul_ps(deltas, delta)); | |
299 | ||
300 | sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc)); | |
301 | sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc)); | |
302 | } | |
303 | ||
304 | /* hadd on AVX is weird, and acts on low-lanes | |
305 | * and high-lanes separately. */ | |
306 | __m256 res_l = _mm256_hadd_ps(sum_l, sum_l); | |
307 | __m256 res_r = _mm256_hadd_ps(sum_r, sum_r); | |
308 | res_l = _mm256_hadd_ps(res_l, res_l); | |
309 | res_r = _mm256_hadd_ps(res_r, res_r); | |
310 | res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l); | |
311 | res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r); | |
312 | ||
313 | /* This is optimized to mov %xmmN, [mem]. | |
314 | * There doesn't seem to be any _mm256_store_ss intrinsic. */ | |
315 | _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0)); | |
316 | _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0)); | |
317 | ||
318 | output += 2; | |
319 | out_frames++; | |
320 | resamp->time += ratio; | |
321 | } | |
322 | } | |
323 | } | |
324 | } | |
325 | ||
326 | data->output_frames = out_frames; | |
327 | } | |
328 | ||
329 | static void resampler_sinc_process_avx(void *re_, struct resampler_data *data) | |
330 | { | |
331 | rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; | |
332 | unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits); | |
333 | ||
334 | uint32_t ratio = phases / data->ratio; | |
335 | const float *input = data->data_in; | |
336 | float *output = data->data_out; | |
337 | size_t frames = data->input_frames; | |
338 | size_t out_frames = 0; | |
339 | unsigned taps = resamp->taps; | |
340 | ||
341 | { | |
342 | while (frames) | |
343 | { | |
344 | while (frames && resamp->time >= phases) | |
345 | { | |
346 | /* Push in reverse to make filter more obvious. */ | |
347 | if (!resamp->ptr) | |
348 | resamp->ptr = taps; | |
349 | resamp->ptr--; | |
350 | ||
351 | resamp->buffer_l[resamp->ptr + taps] = | |
352 | resamp->buffer_l[resamp->ptr] = *input++; | |
353 | ||
354 | resamp->buffer_r[resamp->ptr + taps] = | |
355 | resamp->buffer_r[resamp->ptr] = *input++; | |
356 | ||
357 | resamp->time -= phases; | |
358 | frames--; | |
359 | } | |
360 | ||
361 | { | |
362 | const float *buffer_l = resamp->buffer_l + resamp->ptr; | |
363 | const float *buffer_r = resamp->buffer_r + resamp->ptr; | |
364 | while (resamp->time < phases) | |
365 | { | |
366 | int i; | |
367 | __m256 delta; | |
368 | unsigned phase = resamp->time >> resamp->subphase_bits; | |
369 | float *phase_table = resamp->phase_table + phase * taps; | |
370 | ||
371 | __m256 sum_l = _mm256_setzero_ps(); | |
372 | __m256 sum_r = _mm256_setzero_ps(); | |
373 | ||
374 | for (i = 0; i < (int)taps; i += 8) | |
375 | { | |
376 | __m256 buf_l = _mm256_loadu_ps(buffer_l + i); | |
377 | __m256 buf_r = _mm256_loadu_ps(buffer_r + i); | |
378 | __m256 sinc = _mm256_load_ps((const float*)phase_table + i); | |
379 | ||
380 | sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc)); | |
381 | sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc)); | |
382 | } | |
383 | ||
384 | /* hadd on AVX is weird, and acts on low-lanes | |
385 | * and high-lanes separately. */ | |
386 | __m256 res_l = _mm256_hadd_ps(sum_l, sum_l); | |
387 | __m256 res_r = _mm256_hadd_ps(sum_r, sum_r); | |
388 | res_l = _mm256_hadd_ps(res_l, res_l); | |
389 | res_r = _mm256_hadd_ps(res_r, res_r); | |
390 | res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l); | |
391 | res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r); | |
392 | ||
393 | /* This is optimized to mov %xmmN, [mem]. | |
394 | * There doesn't seem to be any _mm256_store_ss intrinsic. */ | |
395 | _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0)); | |
396 | _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0)); | |
397 | ||
398 | output += 2; | |
399 | out_frames++; | |
400 | resamp->time += ratio; | |
401 | } | |
402 | } | |
403 | } | |
404 | } | |
405 | ||
406 | data->output_frames = out_frames; | |
407 | } | |
408 | #endif | |
409 | ||
410 | #if defined(__SSE__) | |
411 | static void resampler_sinc_process_sse_kaiser(void *re_, struct resampler_data *data) | |
412 | { | |
413 | rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; | |
414 | unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits); | |
415 | ||
416 | uint32_t ratio = phases / data->ratio; | |
417 | const float *input = data->data_in; | |
418 | float *output = data->data_out; | |
419 | size_t frames = data->input_frames; | |
420 | size_t out_frames = 0; | |
421 | unsigned taps = resamp->taps; | |
422 | ||
423 | { | |
424 | while (frames) | |
425 | { | |
426 | while (frames && resamp->time >= phases) | |
427 | { | |
428 | /* Push in reverse to make filter more obvious. */ | |
429 | if (!resamp->ptr) | |
430 | resamp->ptr = taps; | |
431 | resamp->ptr--; | |
432 | ||
433 | resamp->buffer_l[resamp->ptr + taps] = | |
434 | resamp->buffer_l[resamp->ptr] = *input++; | |
435 | ||
436 | resamp->buffer_r[resamp->ptr + taps] = | |
437 | resamp->buffer_r[resamp->ptr] = *input++; | |
438 | ||
439 | resamp->time -= phases; | |
440 | frames--; | |
441 | } | |
442 | ||
443 | { | |
444 | const float *buffer_l = resamp->buffer_l + resamp->ptr; | |
445 | const float *buffer_r = resamp->buffer_r + resamp->ptr; | |
446 | while (resamp->time < phases) | |
447 | { | |
448 | int i; | |
449 | __m128 sum; | |
450 | unsigned phase = resamp->time >> resamp->subphase_bits; | |
451 | float *phase_table = resamp->phase_table + phase * taps * 2; | |
452 | float *delta_table = phase_table + taps; | |
453 | __m128 delta = _mm_set1_ps((float) | |
454 | (resamp->time & resamp->subphase_mask) * resamp->subphase_mod); | |
455 | ||
456 | __m128 sum_l = _mm_setzero_ps(); | |
457 | __m128 sum_r = _mm_setzero_ps(); | |
458 | ||
459 | for (i = 0; i < (int)taps; i += 4) | |
460 | { | |
461 | __m128 buf_l = _mm_loadu_ps(buffer_l + i); | |
462 | __m128 buf_r = _mm_loadu_ps(buffer_r + i); | |
463 | __m128 deltas = _mm_load_ps(delta_table + i); | |
464 | __m128 _sinc = _mm_add_ps(_mm_load_ps((const float*)phase_table + i), | |
465 | _mm_mul_ps(deltas, delta)); | |
466 | sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc)); | |
467 | sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc)); | |
468 | } | |
469 | ||
470 | /* Them annoying shuffles. | |
471 | * sum_l = { l3, l2, l1, l0 } | |
472 | * sum_r = { r3, r2, r1, r0 } | |
473 | */ | |
474 | ||
475 | sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, | |
476 | _MM_SHUFFLE(1, 0, 1, 0)), | |
477 | _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2))); | |
478 | ||
479 | /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 } | |
480 | * sum = { R1, R0, L1, L0 } | |
481 | */ | |
482 | ||
483 | sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum); | |
484 | ||
485 | /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 } | |
486 | * sum = { X, R, X, L } | |
487 | */ | |
488 | ||
489 | /* Store L */ | |
490 | _mm_store_ss(output + 0, sum); | |
491 | ||
492 | /* movehl { X, R, X, L } == { X, R, X, R } */ | |
493 | _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum)); | |
494 | ||
495 | output += 2; | |
496 | out_frames++; | |
497 | resamp->time += ratio; | |
498 | } | |
499 | } | |
500 | } | |
501 | } | |
502 | ||
503 | data->output_frames = out_frames; | |
504 | } | |
505 | ||
506 | static void resampler_sinc_process_sse(void *re_, struct resampler_data *data) | |
507 | { | |
508 | rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; | |
509 | unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits); | |
510 | ||
511 | uint32_t ratio = phases / data->ratio; | |
512 | const float *input = data->data_in; | |
513 | float *output = data->data_out; | |
514 | size_t frames = data->input_frames; | |
515 | size_t out_frames = 0; | |
516 | unsigned taps = resamp->taps; | |
517 | ||
518 | { | |
519 | while (frames) | |
520 | { | |
521 | while (frames && resamp->time >= phases) | |
522 | { | |
523 | /* Push in reverse to make filter more obvious. */ | |
524 | if (!resamp->ptr) | |
525 | resamp->ptr = taps; | |
526 | resamp->ptr--; | |
527 | ||
528 | resamp->buffer_l[resamp->ptr + taps] = | |
529 | resamp->buffer_l[resamp->ptr] = *input++; | |
530 | ||
531 | resamp->buffer_r[resamp->ptr + taps] = | |
532 | resamp->buffer_r[resamp->ptr] = *input++; | |
533 | ||
534 | resamp->time -= phases; | |
535 | frames--; | |
536 | } | |
537 | ||
538 | { | |
539 | const float *buffer_l = resamp->buffer_l + resamp->ptr; | |
540 | const float *buffer_r = resamp->buffer_r + resamp->ptr; | |
541 | while (resamp->time < phases) | |
542 | { | |
543 | int i; | |
544 | __m128 sum; | |
545 | unsigned phase = resamp->time >> resamp->subphase_bits; | |
546 | float *phase_table = resamp->phase_table + phase * taps; | |
547 | ||
548 | __m128 sum_l = _mm_setzero_ps(); | |
549 | __m128 sum_r = _mm_setzero_ps(); | |
550 | ||
551 | for (i = 0; i < (int)taps; i += 4) | |
552 | { | |
553 | __m128 buf_l = _mm_loadu_ps(buffer_l + i); | |
554 | __m128 buf_r = _mm_loadu_ps(buffer_r + i); | |
555 | __m128 _sinc = _mm_load_ps((const float*)phase_table + i); | |
556 | sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc)); | |
557 | sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc)); | |
558 | } | |
559 | ||
560 | /* Them annoying shuffles. | |
561 | * sum_l = { l3, l2, l1, l0 } | |
562 | * sum_r = { r3, r2, r1, r0 } | |
563 | */ | |
564 | ||
565 | sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, | |
566 | _MM_SHUFFLE(1, 0, 1, 0)), | |
567 | _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2))); | |
568 | ||
569 | /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 } | |
570 | * sum = { R1, R0, L1, L0 } | |
571 | */ | |
572 | ||
573 | sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum); | |
574 | ||
575 | /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 } | |
576 | * sum = { X, R, X, L } | |
577 | */ | |
578 | ||
579 | /* Store L */ | |
580 | _mm_store_ss(output + 0, sum); | |
581 | ||
582 | /* movehl { X, R, X, L } == { X, R, X, R } */ | |
583 | _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum)); | |
584 | ||
585 | output += 2; | |
586 | out_frames++; | |
587 | resamp->time += ratio; | |
588 | } | |
589 | } | |
590 | } | |
591 | } | |
592 | ||
593 | data->output_frames = out_frames; | |
594 | } | |
595 | #endif | |
596 | ||
597 | static void resampler_sinc_process_c_kaiser(void *re_, struct resampler_data *data) | |
598 | { | |
599 | rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; | |
600 | unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits); | |
601 | ||
602 | uint32_t ratio = phases / data->ratio; | |
603 | const float *input = data->data_in; | |
604 | float *output = data->data_out; | |
605 | size_t frames = data->input_frames; | |
606 | size_t out_frames = 0; | |
607 | unsigned taps = resamp->taps; | |
608 | ||
609 | { | |
610 | while (frames) | |
611 | { | |
612 | while (frames && resamp->time >= phases) | |
613 | { | |
614 | /* Push in reverse to make filter more obvious. */ | |
615 | if (!resamp->ptr) | |
616 | resamp->ptr = taps; | |
617 | resamp->ptr--; | |
618 | ||
619 | resamp->buffer_l[resamp->ptr + taps] = | |
620 | resamp->buffer_l[resamp->ptr] = *input++; | |
621 | ||
622 | resamp->buffer_r[resamp->ptr + taps] = | |
623 | resamp->buffer_r[resamp->ptr] = *input++; | |
624 | ||
625 | resamp->time -= phases; | |
626 | frames--; | |
627 | } | |
628 | ||
629 | { | |
630 | const float *buffer_l = resamp->buffer_l + resamp->ptr; | |
631 | const float *buffer_r = resamp->buffer_r + resamp->ptr; | |
632 | while (resamp->time < phases) | |
633 | { | |
634 | int i; | |
635 | float sum_l = 0.0f; | |
636 | float sum_r = 0.0f; | |
637 | unsigned phase = resamp->time >> resamp->subphase_bits; | |
638 | float *phase_table = resamp->phase_table + phase * taps * 2; | |
639 | float *delta_table = phase_table + taps; | |
640 | float delta = (float) | |
641 | (resamp->time & resamp->subphase_mask) * resamp->subphase_mod; | |
642 | ||
643 | for (i = 0; i < (int)taps; i++) | |
644 | { | |
645 | float sinc_val = phase_table[i] + delta_table[i] * delta; | |
646 | ||
647 | sum_l += buffer_l[i] * sinc_val; | |
648 | sum_r += buffer_r[i] * sinc_val; | |
649 | } | |
650 | ||
651 | output[0] = sum_l; | |
652 | output[1] = sum_r; | |
653 | ||
654 | output += 2; | |
655 | out_frames++; | |
656 | resamp->time += ratio; | |
657 | } | |
658 | } | |
659 | ||
660 | } | |
661 | } | |
662 | ||
663 | data->output_frames = out_frames; | |
664 | } | |
665 | ||
666 | static void resampler_sinc_process_c(void *re_, struct resampler_data *data) | |
667 | { | |
668 | rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; | |
669 | unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits); | |
670 | ||
671 | uint32_t ratio = phases / data->ratio; | |
672 | const float *input = data->data_in; | |
673 | float *output = data->data_out; | |
674 | size_t frames = data->input_frames; | |
675 | size_t out_frames = 0; | |
676 | unsigned taps = resamp->taps; | |
677 | ||
678 | { | |
679 | while (frames) | |
680 | { | |
681 | while (frames && resamp->time >= phases) | |
682 | { | |
683 | /* Push in reverse to make filter more obvious. */ | |
684 | if (!resamp->ptr) | |
685 | resamp->ptr = taps; | |
686 | resamp->ptr--; | |
687 | ||
688 | resamp->buffer_l[resamp->ptr + taps] = | |
689 | resamp->buffer_l[resamp->ptr] = *input++; | |
690 | ||
691 | resamp->buffer_r[resamp->ptr + taps] = | |
692 | resamp->buffer_r[resamp->ptr] = *input++; | |
693 | ||
694 | resamp->time -= phases; | |
695 | frames--; | |
696 | } | |
697 | ||
698 | { | |
699 | const float *buffer_l = resamp->buffer_l + resamp->ptr; | |
700 | const float *buffer_r = resamp->buffer_r + resamp->ptr; | |
701 | while (resamp->time < phases) | |
702 | { | |
703 | int i; | |
704 | float sum_l = 0.0f; | |
705 | float sum_r = 0.0f; | |
706 | unsigned phase = resamp->time >> resamp->subphase_bits; | |
707 | float *phase_table = resamp->phase_table + phase * taps; | |
708 | ||
709 | for (i = 0; i < (int)taps; i++) | |
710 | { | |
711 | float sinc_val = phase_table[i]; | |
712 | ||
713 | sum_l += buffer_l[i] * sinc_val; | |
714 | sum_r += buffer_r[i] * sinc_val; | |
715 | } | |
716 | ||
717 | output[0] = sum_l; | |
718 | output[1] = sum_r; | |
719 | ||
720 | output += 2; | |
721 | out_frames++; | |
722 | resamp->time += ratio; | |
723 | } | |
724 | } | |
725 | ||
726 | } | |
727 | } | |
728 | ||
729 | data->output_frames = out_frames; | |
730 | } | |
731 | ||
732 | static void resampler_sinc_free(void *data) | |
733 | { | |
734 | rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)data; | |
735 | if (resamp) | |
736 | memalign_free(resamp->main_buffer); | |
737 | free(resamp); | |
738 | } | |
739 | ||
740 | static void sinc_init_table_kaiser(rarch_sinc_resampler_t *resamp, | |
741 | double cutoff, | |
742 | float *phase_table, int phases, int taps, bool calculate_delta) | |
743 | { | |
744 | int i, j; | |
745 | /* Kaiser window function - need to normalize w(0) to 1.0f */ | |
746 | float kaiser_beta = resamp->kaiser_beta; | |
747 | double window_mod = besseli0(kaiser_beta); | |
748 | int stride = calculate_delta ? 2 : 1; | |
749 | double sidelobes = taps / 2.0; | |
750 | ||
751 | for (i = 0; i < phases; i++) | |
752 | { | |
753 | for (j = 0; j < taps; j++) | |
754 | { | |
755 | float val; | |
756 | double sinc_phase; | |
757 | int n = j * phases + i; | |
758 | double window_phase = (double)n / (phases * taps); /* [0, 1). */ | |
759 | window_phase = 2.0 * window_phase - 1.0; /* [-1, 1) */ | |
760 | sinc_phase = sidelobes * window_phase; | |
761 | val = cutoff * sinc(M_PI * sinc_phase * cutoff) * | |
762 | besseli0(kaiser_beta * sqrtf(1 - window_phase * window_phase)) | |
763 | / window_mod; | |
764 | phase_table[i * stride * taps + j] = val; | |
765 | } | |
766 | } | |
767 | ||
768 | if (calculate_delta) | |
769 | { | |
770 | int phase; | |
771 | int p; | |
772 | ||
773 | for (p = 0; p < phases - 1; p++) | |
774 | { | |
775 | for (j = 0; j < taps; j++) | |
776 | { | |
777 | float delta = phase_table[(p + 1) * stride * taps + j] - | |
778 | phase_table[p * stride * taps + j]; | |
779 | phase_table[(p * stride + 1) * taps + j] = delta; | |
780 | } | |
781 | } | |
782 | ||
783 | phase = phases - 1; | |
784 | for (j = 0; j < taps; j++) | |
785 | { | |
786 | float val, delta; | |
787 | double sinc_phase; | |
788 | int n = j * phases + (phase + 1); | |
789 | double window_phase = (double)n / (phases * taps); /* (0, 1]. */ | |
790 | window_phase = 2.0 * window_phase - 1.0; /* (-1, 1] */ | |
791 | sinc_phase = sidelobes * window_phase; | |
792 | ||
793 | val = cutoff * sinc(M_PI * sinc_phase * cutoff) * | |
794 | besseli0(resamp->kaiser_beta * sqrtf(1 - window_phase * | |
795 | window_phase)) / window_mod; | |
796 | delta = (val - phase_table[phase * stride * taps + j]); | |
797 | phase_table[(phase * stride + 1) * taps + j] = delta; | |
798 | } | |
799 | } | |
800 | } | |
801 | ||
802 | static void sinc_init_table_lanczos( | |
803 | rarch_sinc_resampler_t *resamp, double cutoff, | |
804 | float *phase_table, int phases, int taps, bool calculate_delta) | |
805 | { | |
806 | int i, j; | |
807 | /* Lanczos window function - need to normalize w(0) to 1.0f */ | |
808 | double window_mod = 1.0; | |
809 | int stride = calculate_delta ? 2 : 1; | |
810 | double sidelobes = taps / 2.0; | |
811 | ||
812 | for (i = 0; i < phases; i++) | |
813 | { | |
814 | for (j = 0; j < taps; j++) | |
815 | { | |
816 | double sinc_phase; | |
817 | float val; | |
818 | int n = j * phases + i; | |
819 | double window_phase = (double)n / (phases * taps); /* [0, 1). */ | |
820 | window_phase = 2.0 * window_phase - 1.0; /* [-1, 1) */ | |
821 | sinc_phase = sidelobes * window_phase; | |
822 | val = cutoff * sinc(M_PI * sinc_phase * cutoff) * | |
823 | sinc(M_PI * window_phase) / window_mod; | |
824 | phase_table[i * stride * taps + j] = val; | |
825 | } | |
826 | } | |
827 | ||
828 | if (calculate_delta) | |
829 | { | |
830 | int phase; | |
831 | int p; | |
832 | ||
833 | for (p = 0; p < phases - 1; p++) | |
834 | { | |
835 | for (j = 0; j < taps; j++) | |
836 | { | |
837 | float delta = phase_table[(p + 1) * stride * taps + j] - | |
838 | phase_table[p * stride * taps + j]; | |
839 | phase_table[(p * stride + 1) * taps + j] = delta; | |
840 | } | |
841 | } | |
842 | ||
843 | phase = phases - 1; | |
844 | for (j = 0; j < taps; j++) | |
845 | { | |
846 | float val, delta; | |
847 | double sinc_phase; | |
848 | int n = j * phases + (phase + 1); | |
849 | double window_phase = (double)n / (phases * taps); /* (0, 1]. */ | |
850 | window_phase = 2.0 * window_phase - 1.0; /* (-1, 1] */ | |
851 | sinc_phase = sidelobes * window_phase; | |
852 | ||
853 | val = cutoff * sinc(M_PI * sinc_phase * cutoff) * | |
854 | sinc(M_PI * window_phase) / window_mod; | |
855 | delta = (val - phase_table[phase * stride * taps + j]); | |
856 | phase_table[(phase * stride + 1) * taps + j] = delta; | |
857 | } | |
858 | } | |
859 | } | |
860 | ||
861 | static void *resampler_sinc_new(const struct resampler_config *config, | |
862 | double bandwidth_mod, enum resampler_quality quality, | |
863 | resampler_simd_mask_t mask) | |
864 | { | |
865 | double cutoff = 0.0; | |
866 | size_t phase_elems = 0; | |
867 | size_t elems = 0; | |
868 | unsigned enable_avx = 0; | |
869 | unsigned sidelobes = 0; | |
870 | enum sinc_window window_type = SINC_WINDOW_NONE; | |
871 | rarch_sinc_resampler_t *re = (rarch_sinc_resampler_t*) | |
872 | calloc(1, sizeof(*re)); | |
873 | ||
874 | if (!re) | |
875 | return NULL; | |
876 | ||
877 | switch (quality) | |
878 | { | |
879 | case RESAMPLER_QUALITY_LOWEST: | |
880 | cutoff = 0.98; | |
881 | sidelobes = 2; | |
882 | re->phase_bits = 12; | |
883 | re->subphase_bits = 10; | |
884 | window_type = SINC_WINDOW_LANCZOS; | |
885 | break; | |
886 | case RESAMPLER_QUALITY_LOWER: | |
887 | cutoff = 0.98; | |
888 | sidelobes = 4; | |
889 | re->phase_bits = 12; | |
890 | re->subphase_bits = 10; | |
891 | window_type = SINC_WINDOW_LANCZOS; | |
892 | break; | |
893 | case RESAMPLER_QUALITY_HIGHER: | |
894 | cutoff = 0.90; | |
895 | sidelobes = 32; | |
896 | re->phase_bits = 10; | |
897 | re->subphase_bits = 14; | |
898 | window_type = SINC_WINDOW_KAISER; | |
899 | re->kaiser_beta = 10.5; | |
900 | enable_avx = 1; | |
901 | break; | |
902 | case RESAMPLER_QUALITY_HIGHEST: | |
903 | cutoff = 0.962; | |
904 | sidelobes = 128; | |
905 | re->phase_bits = 10; | |
906 | re->subphase_bits = 14; | |
907 | window_type = SINC_WINDOW_KAISER; | |
908 | re->kaiser_beta = 14.5; | |
909 | enable_avx = 1; | |
910 | break; | |
911 | case RESAMPLER_QUALITY_NORMAL: | |
912 | case RESAMPLER_QUALITY_DONTCARE: | |
913 | cutoff = 0.825; | |
914 | sidelobes = 8; | |
915 | re->phase_bits = 8; | |
916 | re->subphase_bits = 16; | |
917 | window_type = SINC_WINDOW_KAISER; | |
918 | re->kaiser_beta = 5.5; | |
919 | break; | |
920 | } | |
921 | ||
922 | re->subphase_mask = (1 << re->subphase_bits) - 1; | |
923 | re->subphase_mod = 1.0f / (1 << re->subphase_bits); | |
924 | re->taps = sidelobes * 2; | |
925 | ||
926 | /* Downsampling, must lower cutoff, and extend number of | |
927 | * taps accordingly to keep same stopband attenuation. */ | |
928 | if (bandwidth_mod < 1.0) | |
929 | { | |
930 | cutoff *= bandwidth_mod; | |
931 | re->taps = (unsigned)ceil(re->taps / bandwidth_mod); | |
932 | } | |
933 | ||
934 | /* Be SIMD-friendly. */ | |
935 | #if defined(__AVX__) | |
936 | if (enable_avx) | |
937 | re->taps = (re->taps + 7) & ~7; | |
938 | else | |
939 | #endif | |
940 | { | |
941 | #if (defined(__ARM_NEON__) || defined(HAVE_NEON)) | |
942 | re->taps = (re->taps + 7) & ~7; | |
943 | #else | |
944 | re->taps = (re->taps + 3) & ~3; | |
945 | #endif | |
946 | } | |
947 | ||
948 | phase_elems = ((1 << re->phase_bits) * re->taps); | |
949 | if (window_type == SINC_WINDOW_KAISER) | |
950 | phase_elems = phase_elems * 2; | |
951 | elems = phase_elems + 4 * re->taps; | |
952 | ||
953 | re->main_buffer = (float*)memalign_alloc(128, sizeof(float) * elems); | |
954 | if (!re->main_buffer) | |
955 | goto error; | |
956 | ||
957 | memset(re->main_buffer, 0, sizeof(float) * elems); | |
958 | ||
959 | re->phase_table = re->main_buffer; | |
960 | re->buffer_l = re->main_buffer + phase_elems; | |
961 | re->buffer_r = re->buffer_l + 2 * re->taps; | |
962 | ||
963 | switch (window_type) | |
964 | { | |
965 | case SINC_WINDOW_LANCZOS: | |
966 | sinc_init_table_lanczos(re, cutoff, re->phase_table, | |
967 | 1 << re->phase_bits, re->taps, false); | |
968 | break; | |
969 | case SINC_WINDOW_KAISER: | |
970 | sinc_init_table_kaiser(re, cutoff, re->phase_table, | |
971 | 1 << re->phase_bits, re->taps, true); | |
972 | break; | |
973 | case SINC_WINDOW_NONE: | |
974 | goto error; | |
975 | } | |
976 | ||
977 | sinc_resampler.process = resampler_sinc_process_c; | |
978 | if (window_type == SINC_WINDOW_KAISER) | |
979 | sinc_resampler.process = resampler_sinc_process_c_kaiser; | |
980 | ||
981 | if (mask & RESAMPLER_SIMD_AVX && enable_avx) | |
982 | { | |
983 | #if defined(__AVX__) | |
984 | sinc_resampler.process = resampler_sinc_process_avx; | |
985 | if (window_type == SINC_WINDOW_KAISER) | |
986 | sinc_resampler.process = resampler_sinc_process_avx_kaiser; | |
987 | #endif | |
988 | } | |
989 | else if (mask & RESAMPLER_SIMD_SSE) | |
990 | { | |
991 | #if defined(__SSE__) | |
992 | sinc_resampler.process = resampler_sinc_process_sse; | |
993 | if (window_type == SINC_WINDOW_KAISER) | |
994 | sinc_resampler.process = resampler_sinc_process_sse_kaiser; | |
995 | #endif | |
996 | } | |
997 | else if (mask & RESAMPLER_SIMD_NEON) | |
998 | { | |
999 | #if (defined(__ARM_NEON__) || defined(HAVE_NEON)) | |
1000 | #ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS | |
1001 | if (window_type != SINC_WINDOW_KAISER) | |
1002 | sinc_resampler.process = resampler_sinc_process_neon; | |
1003 | #else | |
1004 | sinc_resampler.process = resampler_sinc_process_neon; | |
1005 | if (window_type == SINC_WINDOW_KAISER) | |
1006 | sinc_resampler.process = resampler_sinc_process_neon_kaiser; | |
1007 | #endif | |
1008 | #endif | |
1009 | } | |
1010 | ||
1011 | return re; | |
1012 | ||
1013 | error: | |
1014 | resampler_sinc_free(re); | |
1015 | return NULL; | |
1016 | } | |
1017 | ||
1018 | retro_resampler_t sinc_resampler = { | |
1019 | resampler_sinc_new, | |
1020 | resampler_sinc_process_c, | |
1021 | resampler_sinc_free, | |
1022 | RESAMPLER_API_VERSION, | |
1023 | "sinc", | |
1024 | "sinc" | |
1025 | }; |