/* Copyright (C) 2010-2021 The RetroArch team
 *
 * ---------------------------------------------------------------------------------------
 * The following license statement only applies to this file (float_to_s16.c).
 * ---------------------------------------------------------------------------------------
 *
 * Permission is hereby granted, free of charge,
 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include <stdint.h>
#include <stddef.h>

#if defined(__SSE2__)
#include <emmintrin.h>
#elif defined(__ALTIVEC__)
#include <altivec.h>
#endif

#if defined(_MIPS_ARCH_ALLEGREX) && defined(DEBUG)
#include <retro_assert.h> /* retro_assert() is used in the DEBUG-only alignment checks below */
#endif

#include <features/features_cpu.h>
#include <audio/conversion/float_to_s16.h>
#if (defined(__ARM_NEON__) || defined(HAVE_NEON))
static bool float_to_s16_neon_enabled = false;
#ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
void convert_float_s16_asm(int16_t *out,
      const float *in, size_t samples);
#else
#include <arm_neon.h>
#endif

void convert_float_to_s16(int16_t *out,
      const float *in, size_t samples)
{
   size_t i = 0;
   if (float_to_s16_neon_enabled)
   {
      float gf        = (1<<15);
      float32x4_t vgf = {gf, gf, gf, gf};
      while (samples >= 8)
      {
#ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
         size_t aligned_samples = samples & ~7;
         if (aligned_samples)
            convert_float_s16_asm(out, in, aligned_samples);

         out     += aligned_samples;
         in      += aligned_samples;
         samples -= aligned_samples;
         i        = 0;
#else
         int16x4x2_t oreg;
         int32x4x2_t creg;
         /* Load 8 floats, de-interleaved into two vectors of 4... */
         float32x4x2_t inreg = vld2q_f32(in);
         /* ...scale by 32768 and convert to 32-bit integers... */
         creg.val[0] = vcvtq_s32_f32(vmulq_f32(inreg.val[0], vgf));
         creg.val[1] = vcvtq_s32_f32(vmulq_f32(inreg.val[1], vgf));
         /* ...narrow to 16 bits with saturation... */
         oreg.val[0] = vqmovn_s32(creg.val[0]);
         oreg.val[1] = vqmovn_s32(creg.val[1]);
         /* ...and store the 8 results, re-interleaved. */
         vst2_s16(out, oreg);
         in      += 8;
         out     += 8;
         samples -= 8;
#endif
      }
   }

   /* Convert whatever the NEON loop above left over (or all samples,
    * if the NEON path is disabled), clamping to the s16 range. */
   for (; i < samples; i++)
   {
      int32_t val = (int32_t)(in[i] * 0x8000);
      out[i]      = (val > 0x7FFF) ? 0x7FFF :
         (val < -0x8000 ? -0x8000 : (int16_t)val);
   }
}

void convert_float_to_s16_init_simd(void)
{
   uint64_t cpu = cpu_features_get();

   if (cpu & RETRO_SIMD_NEON)
      float_to_s16_neon_enabled = true;
}
#else
void convert_float_to_s16(int16_t *out,
      const float *in, size_t samples)
{
   size_t i = 0;
#if defined(__SSE2__)
   /* Fill all four lanes of a vector with 32768.0f */
   __m128 factor = _mm_set1_ps((float)0x8000);

   for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
   {  /* Step forward 8 samples at a time... */
      __m128 input_a = _mm_loadu_ps(in + 0);            /* Create a 4-float vector from the next four samples... */
      __m128 input_b = _mm_loadu_ps(in + 4);            /* ...and another from the *next* four. */
      __m128 res_a   = _mm_mul_ps(input_a, factor);
      __m128 res_b   = _mm_mul_ps(input_b, factor);     /* Multiply these samples by 32768... */
      __m128i ints_a = _mm_cvtps_epi32(res_a);
      __m128i ints_b = _mm_cvtps_epi32(res_b);          /* ...convert them to 32-bit integers... */
      __m128i packed = _mm_packs_epi32(ints_a, ints_b); /* ...then pack them into 16-bit ints, clamping to [-32768, 32767]... */

      _mm_storeu_si128((__m128i *)out, packed);         /* ...and put the result in the output array. */
   }

   samples = samples - i;
   i       = 0;
   /* If there are any stray samples at the end, we still need to convert them
    * (the original array may not have contained a multiple of 8 samples). */
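   /* For instance, with samples == 13 (an illustrative number only) the loop
    * above converts 8 of them and the scalar loop at the bottom of this
    * function handles the remaining 5. */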
#elif defined(__ALTIVEC__)
   int samples_in = samples;

   /* Unaligned loads/stores are a bit expensive,
    * so we optimize for the good (and very likely) path. */
   if (((uintptr_t)out & 15) + ((uintptr_t)in & 15) == 0)
   {
      size_t i;
      for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
      {
         /* Load two aligned vectors of 4 floats each... */
         vector float input0 = vec_ld( 0, in);
         vector float input1 = vec_ld(16, in);
         /* ...convert to signed integers while scaling by 2^15... */
         vector signed int result0 = vec_cts(input0, 15);
         vector signed int result1 = vec_cts(input1, 15);
         /* ...then pack to 16 bits with saturation and store. */
         vec_st(vec_packs(result0, result1), 0, out);
      }

      samples_in -= i;
   }

   samples = samples_in;
   i       = 0;
#elif defined(_MIPS_ARCH_ALLEGREX)
#ifdef DEBUG
   /* Make sure the buffers are 16-byte aligned; this should be
    * the default behaviour of malloc in the PSPSDK.
    * Assume alignment. */
   retro_assert(((uintptr_t)in  & 0xf) == 0);
   retro_assert(((uintptr_t)out & 0xf) == 0);
#endif

   for (i = 0; i + 8 <= samples; i += 8)
   {
      /* Load 8 floats into two VFPU quads, convert to 32-bit integers
       * scaled by 2^31, keep the high 16 bits of each word (a net scale
       * of 2^15), and store the 8 packed s16 results. */
      __asm__ (
         ".set push                 \n"
         ".set noreorder            \n"

         "lv.q    c100,  0(%0)      \n"
         "lv.q    c110, 16(%0)      \n"

         "vf2in.q c100, c100, 31    \n"
         "vf2in.q c110, c110, 31    \n"
         "vi2s.q  c100, c100        \n"
         "vi2s.q  c102, c110        \n"

         "sv.q    c100,  0(%1)      \n"

         ".set pop                  \n"
         :: "r"(in + i), "r"(out + i));
   }
#endif

   /* This loop converts stray samples to the right format,
    * but it's also a fallback in case no SIMD instructions are available. */
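   /* A few illustrative values: 0.5f * 0x8000 = 16384, stored as-is;
    * 1.0f * 0x8000 = 32768, which exceeds 0x7FFF and is clamped to 32767;
    * -1.0f * 0x8000 = -32768, the lowest value an int16_t can hold. */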
   for (; i < samples; i++)
   {
      int32_t val = (int32_t)(in[i] * 0x8000);
      out[i]      = (val > 0x7FFF)
         ? 0x7FFF
         : (val < -0x8000 ? -0x8000 : (int16_t)val);
   }
}

void convert_float_to_s16_init_simd(void) { }
#endif
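
/* A minimal usage sketch (not part of the original file; the function and
 * buffer names below are illustrative only): call
 * convert_float_to_s16_init_simd() once at startup so the NEON runtime check
 * can run, then hand batches of float samples to convert_float_to_s16().
 * Kept inside #if 0 so it is never compiled. */
#if 0
static void example_float_to_s16_usage(void)
{
   float   in[8] = { 0.0f, 0.25f, -0.25f, 0.5f, -0.5f, 1.0f, -1.0f, 0.125f };
   int16_t out[8];

   convert_float_to_s16_init_simd(); /* one-time SIMD capability check */
   convert_float_to_s16(out, in, 8); /* 8 samples: exactly one SIMD iteration */
}
#endif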