deps/libretro-common/audio/conversion/float_to_s16.c

   1 /* Copyright  (C) 2010-2021 The RetroArch team
   2  *
   3  * ---------------------------------------------------------------------------------------
   4  * The following license statement only applies to this file (float_to_s16.c).
   5  * ---------------------------------------------------------------------------------------
   6  *
   7  * Permission is hereby granted, free of charge,
   8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation the rights to
  10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
  11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
  16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22 #include <stdint.h>
  23 #include <stddef.h>
  24
  25 #if defined(__SSE2__)
  26 #include <emmintrin.h>
  27 #elif defined(__ALTIVEC__)
  28 #include <altivec.h>
  29 #endif
  30
  31 #include <features/features_cpu.h>
  32 #include <audio/conversion/float_to_s16.h>
  33
  34 #if (defined(__ARM_NEON__) || defined(HAVE_NEON))
  35 static bool float_to_s16_neon_enabled = false;
  36 #ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
  37 void convert_float_s16_asm(int16_t *out,
  38       const float *in, size_t samples);
  39 #else
  40 #include <arm_neon.h>
  41 #endif
  42
  43 void convert_float_to_s16(int16_t *out,
  44       const float *in, size_t samples)
  45 {
  46    size_t i           = 0;
  47    if (float_to_s16_neon_enabled)
  48    {
  49       float        gf = (1<<15);
  50       float32x4_t vgf = {gf, gf, gf, gf};
  51       while (samples >= 8)
  52       {
  53 #ifdef HAVE_ARM_NEON_ASM_OPTIMIZATIONS
  54          size_t aligned_samples = samples & ~7;
  55          if (aligned_samples)
  56             convert_float_s16_asm(out, in, aligned_samples);
  57
  58          out                += aligned_samples;
  59          in                 += aligned_samples;
  60          samples            -= aligned_samples;
  61          i                   = 0;
  62 #else
  63          int16x4x2_t oreg;
  64          int32x4x2_t creg;
  65          float32x4x2_t inreg = vld2q_f32(in);
  66          creg.val[0]         = vcvtq_s32_f32(vmulq_f32(inreg.val[0], vgf));
  67          creg.val[1]         = vcvtq_s32_f32(vmulq_f32(inreg.val[1], vgf));
  68          oreg.val[0]         = vqmovn_s32(creg.val[0]);
  69          oreg.val[1]         = vqmovn_s32(creg.val[1]);
  70          vst2_s16(out, oreg);
  71          in                 += 8;
  72          out                += 8;
  73          samples            -= 8;
  74 #endif
  75       }
  76    }
  77
  78    for (; i < samples; i++)
  79    {
  80       int32_t val = (int32_t)(in[i] * 0x8000);
  81       out[i]      = (val > 0x7FFF) ? 0x7FFF :
  82          (val < -0x8000 ? -0x8000 : (int16_t)val);
  83    }
  84 }
  85
  86 void convert_float_to_s16_init_simd(void)
  87 {
  88    uint64_t cpu = cpu_features_get();
  89
  90    if (cpu & RETRO_SIMD_NEON)
  91       float_to_s16_neon_enabled = true;
  92 }
  93 #else
  94 void convert_float_to_s16(int16_t *out,
  95       const float *in, size_t samples)
  96 {
  97    size_t i          = 0;
  98 #if defined(__SSE2__)
  99    __m128 factor     = _mm_set1_ps((float)0x8000);
 100    /* Initialize a 4D vector with 32768.0 for its elements */
 101
 102    for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
 103    { /* Skip forward 8 samples at a time... */
 104       __m128 input_a = _mm_loadu_ps(in + 0); /* Create a 4-float vector from the next four samples... */
 105       __m128 input_b = _mm_loadu_ps(in + 4); /* ...and another from the *next* next four. */
 106       __m128 res_a   = _mm_mul_ps(input_a, factor);
 107       __m128 res_b   = _mm_mul_ps(input_b, factor); /* Multiply these samples by 32768 */
 108       __m128i ints_a = _mm_cvtps_epi32(res_a);
 109       __m128i ints_b = _mm_cvtps_epi32(res_b); /* Convert the samples to 32-bit integers */
 110       __m128i packed = _mm_packs_epi32(ints_a, ints_b); /* Then convert them to 16-bit ints, clamping to [-32768, 32767] */
 111
 112       _mm_storeu_si128((__m128i *)out, packed); /* Then put the result in the output array */
 113    }
 114
 115    samples           = samples - i;
 116    i                 = 0;
 117    /* If there are any stray samples at the end, we need to convert them
 118     * (maybe the original array didn't contain a multiple of 8 samples) */
 119 #elif defined(__ALTIVEC__)
 120    int samples_in    = samples;
 121
 122    /* Unaligned loads/store is a bit expensive,
 123     * so we optimize for the good path (very likely). */
 124    if (((uintptr_t)out & 15) + ((uintptr_t)in & 15) == 0)
 125    {
 126       size_t i;
 127       for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
 128       {
 129          vector float       input0 = vec_ld( 0, in);
 130          vector float       input1 = vec_ld(16, in);
 131          vector signed int result0 = vec_cts(input0, 15);
 132          vector signed int result1 = vec_cts(input1, 15);
 133          vec_st(vec_packs(result0, result1), 0, out);
 134       }
 135
 136       samples_in    -= i;
 137    }
 138
 139    samples           = samples_in;
 140    i                 = 0;
 141 #elif defined(_MIPS_ARCH_ALLEGREX)
 142 #ifdef DEBUG
 143    /* Make sure the buffers are 16 byte aligned, this should be
 144     * the default behaviour of malloc in the PSPSDK.
 145     * Assume alignment. */
 146    retro_assert(((uintptr_t)in  & 0xf) == 0);
 147    retro_assert(((uintptr_t)out & 0xf) == 0);
 148 #endif
 149
 150    for (i = 0; i + 8 <= samples; i += 8)
 151    {
 152       __asm__ (
 153             ".set    push                 \n"
 154             ".set    noreorder            \n"
 155
 156             "lv.q    c100,  0(%0)         \n"
 157             "lv.q    c110,  16(%0)        \n"
 158
 159             "vf2in.q c100, c100, 31       \n"
 160             "vf2in.q c110, c110, 31       \n"
 161             "vi2s.q  c100, c100           \n"
 162             "vi2s.q  c102, c110           \n"
 163
 164             "sv.q    c100,  0(%1)         \n"
 165
 166             ".set    pop                  \n"
 167             :: "r"(in + i), "r"(out + i));
 168    }
 169 #endif
 170
 171    /* This loop converts stray samples to the right format,
 172     * but it's also a fallback in case no SIMD instructions are available. */
 173    for (; i < samples; i++)
 174    {
 175       int32_t val    = (int32_t)(in[i] * 0x8000);
 176       out[i]         = (val > 0x7FFF)
 177          ? 0x7FFF
 178          : (val < -0x8000 ? -0x8000 : (int16_t)val);
 179    }
 180 }
 181
 182 void convert_float_to_s16_init_simd(void) { }
 183 #endif