| 1 | /* Copyright (C) 2010-2021 The RetroArch team |
| 2 | * |
| 3 | * --------------------------------------------------------------------------------------- |
| 4 | * The following license statement only applies to this file (float_to_s16.c). |
| 5 | * --------------------------------------------------------------------------------------- |
| 6 | * |
| 7 | * Permission is hereby granted, free of charge, |
| 8 | * to any person obtaining a copy of this software and associated documentation files (the "Software"), |
| 9 | * to deal in the Software without restriction, including without limitation the rights to |
| 10 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, |
| 11 | * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. |
| 14 | * |
| 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, |
| 16 | * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
| 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
| 19 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 21 | */ |
| 22 | #include <stdint.h> |
| 23 | #include <stddef.h> |
| 24 | |
| 25 | #if defined(__SSE2__) |
| 26 | #include <emmintrin.h> |
| 27 | #elif defined(__ALTIVEC__) |
| 28 | #include <altivec.h> |
| 29 | #endif |
| 30 | |
| 31 | #include <features/features_cpu.h> |
| 32 | #include <audio/conversion/float_to_s16.h> |
| 33 | |
| 34 | #if (defined(__ARM_NEON__) || defined(HAVE_NEON)) |
| 35 | static bool float_to_s16_neon_enabled = false; |
| 36 | #ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS |
| 37 | void convert_float_s16_asm(int16_t *out, |
| 38 | const float *in, size_t samples); |
| 39 | #else |
| 40 | #include <arm_neon.h> |
| 41 | #endif |
| 42 | |
/**
 * convert_float_to_s16 (ARM NEON build):
 * Convert @samples float samples in [-1.0, 1.0] from @in into signed
 * 16-bit samples in @out, scaling by 32768 and saturating to
 * [-0x8000, 0x7FFF]. @samples counts individual samples, not frames.
 * Buffers must not overlap. Call convert_float_to_s16_init_simd()
 * once beforehand to enable the NEON fast path at runtime.
 */
void convert_float_to_s16(int16_t *out,
      const float *in, size_t samples)
{
   size_t i = 0;
   if (float_to_s16_neon_enabled)
   {
      /* 1<<15 = 32768.0f: the float -> s16 scale factor. */
      float gf = (1<<15);
      float32x4_t vgf = {gf, gf, gf, gf};
      /* Process whole groups of 8 samples with NEON;
       * leftovers fall through to the scalar tail loop. */
      while (samples >= 8)
      {
#ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
         /* Hand-written asm converts every whole group of 8 in a
          * single call, so this loop body runs at most once. */
         size_t aligned_samples = samples & ~7;
         if (aligned_samples)
            convert_float_s16_asm(out, in, aligned_samples);

         out += aligned_samples;
         in += aligned_samples;
         samples -= aligned_samples;
         i = 0; /* tail loop below indexes the advanced in/out pointers */
#else
         int16x4x2_t oreg;
         int32x4x2_t creg;
         /* vld2q/vst2_s16 de-interleave and re-interleave in matched
          * fashion; both lanes get identical treatment, so the net
          * element order is preserved. */
         float32x4x2_t inreg = vld2q_f32(in);
         creg.val[0] = vcvtq_s32_f32(vmulq_f32(inreg.val[0], vgf));
         creg.val[1] = vcvtq_s32_f32(vmulq_f32(inreg.val[1], vgf));
         /* vqmovn_s32: saturating narrow s32 -> s16 (clamps out-of-range). */
         oreg.val[0] = vqmovn_s32(creg.val[0]);
         oreg.val[1] = vqmovn_s32(creg.val[1]);
         vst2_s16(out, oreg);
         in += 8;
         out += 8;
         samples -= 8;
#endif
      }
   }

   /* Scalar path: converts the remaining (<8) samples after the SIMD
    * loop, and the entire buffer when NEON is not enabled. */
   for (; i < samples; i++)
   {
      int32_t val = (int32_t)(in[i] * 0x8000);
      out[i] = (val > 0x7FFF) ? 0x7FFF :
         (val < -0x8000 ? -0x8000 : (int16_t)val);
   }
}
| 85 | |
| 86 | void convert_float_to_s16_init_simd(void) |
| 87 | { |
| 88 | uint64_t cpu = cpu_features_get(); |
| 89 | |
| 90 | if (cpu & RETRO_SIMD_NEON) |
| 91 | float_to_s16_neon_enabled = true; |
| 92 | } |
| 93 | #else |
/**
 * convert_float_to_s16:
 * Convert @samples float samples in [-1.0, 1.0] from @in into signed
 * 16-bit samples in @out, scaling by 32768 and saturating to
 * [-0x8000, 0x7FFF]. @samples counts individual samples, not frames.
 * Buffers must not overlap. Uses SSE2, AltiVec or PSP VFPU when the
 * build enables them; otherwise runs the portable scalar loop.
 */
void convert_float_to_s16(int16_t *out,
      const float *in, size_t samples)
{
   size_t i = 0;
#if defined(__SSE2__)
   __m128 factor = _mm_set1_ps((float)0x8000);
   /* Initialize a 4D vector with 32768.0 for its elements */

   for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
   { /* Skip forward 8 samples at a time... */
      __m128 input_a = _mm_loadu_ps(in + 0); /* Create a 4-float vector from the next four samples... */
      __m128 input_b = _mm_loadu_ps(in + 4); /* ...and another from the *next* next four. */
      __m128 res_a = _mm_mul_ps(input_a, factor);
      __m128 res_b = _mm_mul_ps(input_b, factor); /* Multiply these samples by 32768 */
      __m128i ints_a = _mm_cvtps_epi32(res_a);
      __m128i ints_b = _mm_cvtps_epi32(res_b); /* Convert the samples to 32-bit integers */
      __m128i packed = _mm_packs_epi32(ints_a, ints_b); /* Then convert them to 16-bit ints, clamping to [-32768, 32767] */

      _mm_storeu_si128((__m128i *)out, packed); /* Then put the result in the output array */
   }

   /* in/out were advanced above, so rebase the counters for the
    * scalar tail loop at the bottom of this function. */
   samples = samples - i;
   i = 0;
   /* If there are any stray samples at the end, we need to convert them
    * (maybe the original array didn't contain a multiple of 8 samples) */
#elif defined(__ALTIVEC__)
   int samples_in = samples;

   /* Unaligned loads/store is a bit expensive,
    * so we optimize for the good path (very likely). */
   if (((uintptr_t)out & 15) + ((uintptr_t)in & 15) == 0)
   {
      size_t i; /* shadows the outer i on purpose; outer i stays 0 */
      for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
      {
         vector float input0 = vec_ld( 0, in);
         vector float input1 = vec_ld(16, in);
         /* vec_cts(x, 15): convert to signed int scaled by 2^15. */
         vector signed int result0 = vec_cts(input0, 15);
         vector signed int result1 = vec_cts(input1, 15);
         /* vec_packs: saturating pack of 8 ints into 8 s16. */
         vec_st(vec_packs(result0, result1), 0, out);
      }

      samples_in -= i;
   }

   /* Leftover (or, when unaligned, all) samples go to the scalar tail. */
   samples = samples_in;
   i = 0;
#elif defined(_MIPS_ARCH_ALLEGREX)
#ifdef DEBUG
   /* Make sure the buffers are 16 byte aligned, this should be
    * the default behaviour of malloc in the PSPSDK.
    * Assume alignment. */
   retro_assert(((uintptr_t)in & 0xf) == 0);
   retro_assert(((uintptr_t)out & 0xf) == 0);
#endif

   /* PSP VFPU path: in/out stay put, so the tail loop below continues
    * from index i with the original base pointers. */
   for (i = 0; i + 8 <= samples; i += 8)
   {
      __asm__ (
         ".set push \n"
         ".set noreorder \n"

         "lv.q c100, 0(%0) \n"
         "lv.q c110, 16(%0) \n"

         /* vf2in ..., 31: float -> int scaled by 2^31;
          * vi2s then keeps the high 16 bits, net s16 scaling.
          * NOTE(review): presumed from VFPU semantics — confirm
          * against the Allegrex VFPU instruction reference. */
         "vf2in.q c100, c100, 31 \n"
         "vf2in.q c110, c110, 31 \n"
         "vi2s.q c100, c100 \n"
         "vi2s.q c102, c110 \n"

         "sv.q c100, 0(%1) \n"

         ".set pop \n"
         :: "r"(in + i), "r"(out + i));
   }
#endif

   /* This loop converts stray samples to the right format,
    * but it's also a fallback in case no SIMD instructions are available. */
   for (; i < samples; i++)
   {
      int32_t val = (int32_t)(in[i] * 0x8000);
      out[i] = (val > 0x7FFF)
         ? 0x7FFF
         : (val < -0x8000 ? -0x8000 : (int16_t)val);
   }
}
| 181 | |
| 182 | void convert_float_to_s16_init_simd(void) { } |
| 183 | #endif |