gpu_neon: fix some missing ebuf updates
[pcsx_rearmed.git] / deps / libretro-common / audio / conversion / float_to_s16.c
CommitLineData
3719602c
PC
1/* Copyright (C) 2010-2021 The RetroArch team
2 *
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (float_to_s16.c).
5 * ---------------------------------------------------------------------------------------
6 *
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22#include <stdint.h>
23#include <stddef.h>
24
25#if defined(__SSE2__)
26#include <emmintrin.h>
27#elif defined(__ALTIVEC__)
28#include <altivec.h>
29#endif
30
31#include <features/features_cpu.h>
32#include <audio/conversion/float_to_s16.h>
33
34#if (defined(__ARM_NEON__) || defined(HAVE_NEON))
/* Runtime switch for the NEON code path of convert_float_to_s16().
 * It starts out disabled; the only visible writer is
 * convert_float_to_s16_init_simd(), which sets it to true. */
static bool float_to_s16_neon_enabled = false;
#ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
/* Bulk conversion routine used when the assembly optimizations are
 * enabled. Only the prototype appears here; the implementation is
 * provided elsewhere. */
void convert_float_s16_asm(int16_t *out,
      const float *in, size_t samples);
#else
/* Without the assembly routine the conversion is written with
 * NEON intrinsics, which come from this header. */
#include <arm_neon.h>
#endif
42
43void convert_float_to_s16(int16_t *out,
44 const float *in, size_t samples)
45{
46 size_t i = 0;
47 if (float_to_s16_neon_enabled)
48 {
49 float gf = (1<<15);
50 float32x4_t vgf = {gf, gf, gf, gf};
51 while (samples >= 8)
52 {
53#ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
54 size_t aligned_samples = samples & ~7;
55 if (aligned_samples)
56 convert_float_s16_asm(out, in, aligned_samples);
57
58 out += aligned_samples;
59 in += aligned_samples;
60 samples -= aligned_samples;
61 i = 0;
62#else
63 int16x4x2_t oreg;
64 int32x4x2_t creg;
65 float32x4x2_t inreg = vld2q_f32(in);
66 creg.val[0] = vcvtq_s32_f32(vmulq_f32(inreg.val[0], vgf));
67 creg.val[1] = vcvtq_s32_f32(vmulq_f32(inreg.val[1], vgf));
68 oreg.val[0] = vqmovn_s32(creg.val[0]);
69 oreg.val[1] = vqmovn_s32(creg.val[1]);
70 vst2_s16(out, oreg);
71 in += 8;
72 out += 8;
73 samples -= 8;
74#endif
75 }
76 }
77
78 for (; i < samples; i++)
79 {
80 int32_t val = (int32_t)(in[i] * 0x8000);
81 out[i] = (val > 0x7FFF) ? 0x7FFF :
82 (val < -0x8000 ? -0x8000 : (int16_t)val);
83 }
84}
85
86void convert_float_to_s16_init_simd(void)
87{
88 uint64_t cpu = cpu_features_get();
89
90 if (cpu & RETRO_SIMD_NEON)
91 float_to_s16_neon_enabled = true;
92}
93#else
/**
 * convert_float_to_s16:
 * @out     : destination buffer, receives @samples signed 16-bit values
 * @in      : source buffer holding @samples floats
 * @samples : number of samples to convert
 *
 * Scales every input sample by 0x8000 and stores it as a signed 16-bit
 * integer clamped to [-0x8000, 0x7FFF].
 *
 * Depending on the compile-time target, blocks of eight samples are
 * converted with SSE2, AltiVec or Allegrex (PSP) vector instructions;
 * the scalar loop at the end handles the remaining samples and is the
 * only path when none of those targets is selected.
 **/
void convert_float_to_s16(int16_t *out,
      const float *in, size_t samples)
{
   size_t i = 0;
#if defined(__SSE2__)
   /* Four copies of the 32768.0 scale factor. */
   __m128 factor = _mm_set1_ps((float)0x8000);
   /* Clamp limits applied before the float -> int conversion.
    * _mm_cvtps_epi32 yields 0x80000000 for any lane that does not fit
    * into 32 bits, which would turn a very large positive sample into
    * -32768 after packing. Clamping first keeps the sign. */
   __m128 hi     = _mm_set1_ps(32767.0f);
   __m128 lo     = _mm_set1_ps(-32768.0f);

   /* Eight samples per iteration; in/out are advanced in step with i. */
   for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
   {
      __m128 input_a = _mm_loadu_ps(in + 0); /* next four samples... */
      __m128 input_b = _mm_loadu_ps(in + 4); /* ...and the four after those */
      /* Scale by 32768, then clamp to the int16 range. */
      __m128 res_a   = _mm_max_ps(_mm_min_ps(_mm_mul_ps(input_a, factor), hi), lo);
      __m128 res_b   = _mm_max_ps(_mm_min_ps(_mm_mul_ps(input_b, factor), hi), lo);
      __m128i ints_a = _mm_cvtps_epi32(res_a);
      __m128i ints_b = _mm_cvtps_epi32(res_b); /* to 32-bit integers */
      /* Narrow to 16-bit with signed saturation. */
      __m128i packed = _mm_packs_epi32(ints_a, ints_b);

      _mm_storeu_si128((__m128i *)out, packed);
   }

   /* in/out already point past the converted part, so restart the
    * index and only count the stray samples (fewer than eight). */
   samples = samples - i;
   i       = 0;
#elif defined(__ALTIVEC__)
   size_t samples_in = samples;

   /* Unaligned loads/store is a bit expensive,
    * so we optimize for the good path (very likely). */
   if (((uintptr_t)out & 15) + ((uintptr_t)in & 15) == 0)
   {
      for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
      {
         vector float input0       = vec_ld( 0, in);
         vector float input1       = vec_ld(16, in);
         vector signed int result0 = vec_cts(input0, 15);
         vector signed int result1 = vec_cts(input1, 15);
         vec_st(vec_packs(result0, result1), 0, out);
      }

      samples_in -= i;
   }

   /* in/out were advanced by the vector loop (if it ran); the scalar
    * loop below starts from index 0 on what is left. */
   samples = samples_in;
   i       = 0;
#elif defined(_MIPS_ARCH_ALLEGREX)
#ifdef DEBUG
   /* Make sure the buffers are 16 byte aligned, this should be
    * the default behaviour of malloc in the PSPSDK.
    * Assume alignment. */
   retro_assert(((uintptr_t)in  & 0xf) == 0);
   retro_assert(((uintptr_t)out & 0xf) == 0);
#endif

   /* The pointers are not advanced here; i keeps counting and the
    * scalar loop below continues from where this loop stops. */
   for (i = 0; i + 8 <= samples; i += 8)
   {
      __asm__ (
            ".set    push                 \n"
            ".set    noreorder            \n"

            "lv.q    c100,  0(%0)         \n"
            "lv.q    c110,  16(%0)        \n"

            "vf2in.q c100, c100, 31       \n"
            "vf2in.q c110, c110, 31       \n"
            "vi2s.q  c100, c100           \n"
            "vi2s.q  c102, c110           \n"

            "sv.q    c100,  0(%1)         \n"

            ".set    pop                  \n"
            :: "r"(in + i), "r"(out + i));
   }
#endif

   /* This loop converts stray samples to the right format,
    * but it's also a fallback in case no SIMD instructions are available.
    * The clamp is done on the float value *before* converting to an
    * integer: casting a float that does not fit into int32_t (or a NaN)
    * is undefined behaviour and, on x86, turns large positive samples
    * into -32768. */
   for (; i < samples; i++)
   {
      float scaled = in[i] * 0x8000;

      if (!(scaled < 32767.0f))        /* too large, or NaN */
         out[i] = 0x7FFF;
      else if (scaled < -32768.0f)
         out[i] = -0x8000;
      else
         out[i] = (int16_t)(int32_t)scaled;
   }
}
181
/**
 * convert_float_to_s16_init_simd:
 *
 * Intentionally empty in this build: the conversion path above is
 * chosen at compile time, so there is no runtime state to set up.
 **/
void convert_float_to_s16_init_simd(void)
{
}
183#endif