| 1 | /* Copyright (C) 2010-2020 The RetroArch team |
| 2 | * |
| 3 | * --------------------------------------------------------------------------------------- |
| 4 | * The following license statement only applies to this file (scaler_int.c). |
| 5 | * --------------------------------------------------------------------------------------- |
| 6 | * |
| 7 | * Permission is hereby granted, free of charge, |
| 8 | * to any person obtaining a copy of this software and associated documentation files (the "Software"), |
| 9 | * to deal in the Software without restriction, including without limitation the rights to |
| 10 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, |
| 11 | * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. |
| 14 | * |
| 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, |
| 16 | * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
| 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
| 19 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 21 | */ |
| 22 | |
| 23 | #include <gfx/scaler/scaler_int.h> |
| 24 | |
| 25 | #include <retro_inline.h> |
| 26 | |
| 27 | #ifdef SCALER_NO_SIMD |
| 28 | #undef __SSE2__ |
| 29 | #endif |
| 30 | |
| 31 | #if defined(__SSE2__) |
| 32 | #include <emmintrin.h> |
| 33 | #ifdef _WIN32 |
| 34 | #include <intrin.h> |
| 35 | #endif |
| 36 | #endif |
| 37 | |
| 38 | /* ARGB8888 scaler is split in two: |
| 39 | * |
| 40 | * First, horizontal scaler is applied. |
 * Here, all 8-bit channels are expanded to 16-bit. Values are then shifted
 * left by 7 so they occupy 15 bits.
| 43 | * |
| 44 | * The sign bit is kept empty as we have to do signed multiplication for the |
| 45 | * filter. |
| 46 | * |
| 47 | * A mulhi [(a * b) >> 16] is applied which loses some precision, but is |
| 48 | * very efficient for SIMD. |
| 49 | * It is accurate enough for 8-bit purposes. |
| 50 | * |
| 51 | * The fixed point 1.0 for filter is (1 << 14). After horizontal scale, |
| 52 | * the output is kept with 16-bit channels, and will now have 13 bits |
| 53 | * of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2. |
| 54 | * |
| 55 | * Vertical scaler takes the 13 bit channels, and performs the |
| 56 | * same mulhi steps. |
| 57 | * Another 2 bits of precision is lost, which ends up as 11 bits. |
| 58 | * Scaling is now complete. Channels are shifted right by 3, and saturated |
| 59 | * into 8-bit values. |
| 60 | * |
 * The C version of the scalers performs the exact same operations as the
 * SIMD code for testing purposes.
| 63 | */ |
| 64 | |
/* Vertical pass of the ARGB8888 scaler.
 *
 * Consumes the intermediate rows produced by the horizontal pass
 * (one uint64_t per pixel, four packed signed 16-bit channels A|R|G|B
 * with 13 bits of precision), convolves each output row against the
 * vertical filter in 1.14 fixed point via mulhi, then shifts down and
 * saturates each channel back to 8 bits.
 *
 * output_ receives out_width x out_height ARGB8888 pixels;
 * stride is the output pitch in bytes. */
void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
{
   int h, w, y;
   const uint64_t *input = ctx->scaled.frame;
   uint32_t *output = (uint32_t*)output_;

   const int16_t *filter_vert = ctx->vert.filter;

   for (h = 0; h < ctx->out_height; h++,
         filter_vert += ctx->vert.filter_stride, output += stride >> 2)
   {
      /* First input row contributing to this output row.
       * scaled.stride is in bytes; >> 3 converts to uint64_t units. */
      const uint64_t *input_base = input + ctx->vert.filter_pos[h]
         * (ctx->scaled.stride >> 3);

      for (w = 0; w < ctx->out_width; w++)
      {
         const uint64_t *input_base_y = input_base + w;
#if defined(__SSE2__)
         __m128i final;
         __m128i res = _mm_setzero_si128();

         /* Two filter taps per iteration: the low 64-bit lane holds the
          * pixel from row y, the high lane the pixel from row y + 1,
          * each multiplied by its coefficient broadcast to all four
          * channels. Advancing by stride >> 2 uint64_t units skips the
          * two rows consumed. */
         for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2,
               input_base_y += (ctx->scaled.stride >> 2))
         {
            __m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll);
            __m128i col = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);

            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         /* Remaining odd tap, if filter_len is odd. */
         for (; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
         {
            __m128i coeff = _mm_set_epi64x(0, filter_vert[y] * 0x0001000100010001ll);
            __m128i col = _mm_set_epi64x(0, input_base_y[0]);

            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         /* Fold the two per-row accumulators together, then shift the
          * 11-bit-precision result down by 7 - 2 - 2 = 3 to restore
          * 8-bit range (see the pipeline comment above). */
         res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
         res = _mm_srai_epi16(res, (7 - 2 - 2));

         /* Saturate each signed 16-bit channel into unsigned 8 bits. */
         final = _mm_packus_epi16(res, res);

         output[w] = _mm_cvtsi128_si32(final);
#else
         int16_t res_a = 0;
         int16_t res_r = 0;
         int16_t res_g = 0;
         int16_t res_b = 0;

         for (y = 0; y < ctx->vert.filter_len; y++,
               input_base_y += (ctx->scaled.stride >> 3))
         {
            uint64_t col = *input_base_y;

            /* Unpack the four signed 16-bit channels of one pixel. */
            int16_t a = (col >> 48) & 0xffff;
            int16_t r = (col >> 32) & 0xffff;
            int16_t g = (col >> 16) & 0xffff;
            int16_t b = (col >> 0) & 0xffff;

            int16_t coeff = filter_vert[y];

            /* mulhi: keep only the high 16 bits of each product,
             * mirroring _mm_mulhi_epi16 above. */
            res_a += (a * coeff) >> 16;
            res_r += (r * coeff) >> 16;
            res_g += (g * coeff) >> 16;
            res_b += (b * coeff) >> 16;
         }

         /* Restore 8-bit range, as in the SIMD path. */
         res_a >>= (7 - 2 - 2);
         res_r >>= (7 - 2 - 2);
         res_g >>= (7 - 2 - 2);
         res_b >>= (7 - 2 - 2);

         output[w] =
            (clamp_8bit(res_a) << 24) |
            (clamp_8bit(res_r) << 16) |
            (clamp_8bit(res_g) << 8) |
            (clamp_8bit(res_b) << 0);
#endif
      }
   }
}
| 147 | |
| 148 | void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride) |
| 149 | { |
| 150 | int h, w, x; |
| 151 | const uint32_t *input = (uint32_t*)input_; |
| 152 | uint64_t *output = ctx->scaled.frame; |
| 153 | |
| 154 | for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2, |
| 155 | output += ctx->scaled.stride >> 3) |
| 156 | { |
| 157 | const int16_t *filter_horiz = ctx->horiz.filter; |
| 158 | |
| 159 | for (w = 0; w < ctx->scaled.width; w++, |
| 160 | filter_horiz += ctx->horiz.filter_stride) |
| 161 | { |
| 162 | const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w]; |
| 163 | #if defined(__SSE2__) |
| 164 | __m128i res = _mm_setzero_si128(); |
| 165 | #ifndef __x86_64__ |
| 166 | union |
| 167 | { |
| 168 | uint32_t *u32; |
| 169 | uint64_t *u64; |
| 170 | } u; |
| 171 | #endif |
| 172 | for (x = 0; (x + 1) < ctx->horiz.filter_len; x += 2) |
| 173 | { |
| 174 | __m128i coeff = _mm_set_epi64x(filter_horiz[x + 1] * 0x0001000100010001ll, filter_horiz[x + 0] * 0x0001000100010001ll); |
| 175 | |
| 176 | __m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0, |
| 177 | ((uint64_t)input_base_x[x + 1] << 32) | input_base_x[x + 0]), _mm_setzero_si128()); |
| 178 | |
| 179 | col = _mm_slli_epi16(col, 7); |
| 180 | res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res); |
| 181 | } |
| 182 | |
| 183 | for (; x < ctx->horiz.filter_len; x++) |
| 184 | { |
| 185 | __m128i coeff = _mm_set_epi64x(0, filter_horiz[x] * 0x0001000100010001ll); |
| 186 | __m128i col = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, 0, input_base_x[x]), _mm_setzero_si128()); |
| 187 | |
| 188 | col = _mm_slli_epi16(col, 7); |
| 189 | res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res); |
| 190 | } |
| 191 | |
| 192 | res = _mm_adds_epi16(_mm_srli_si128(res, 8), res); |
| 193 | |
| 194 | #ifdef __x86_64__ |
| 195 | output[w] = _mm_cvtsi128_si64(res); |
| 196 | #else /* 32-bit doesn't have si64. Do it in two steps. */ |
| 197 | u.u64 = output + w; |
| 198 | u.u32[0] = _mm_cvtsi128_si32(res); |
| 199 | u.u32[1] = _mm_cvtsi128_si32(_mm_srli_si128(res, 4)); |
| 200 | #endif |
| 201 | #else |
| 202 | int16_t res_a = 0; |
| 203 | int16_t res_r = 0; |
| 204 | int16_t res_g = 0; |
| 205 | int16_t res_b = 0; |
| 206 | |
| 207 | for (x = 0; x < ctx->horiz.filter_len; x++) |
| 208 | { |
| 209 | uint32_t col = input_base_x[x]; |
| 210 | |
| 211 | int16_t a = (col >> (24 - 7)) & (0xff << 7); |
| 212 | int16_t r = (col >> (16 - 7)) & (0xff << 7); |
| 213 | int16_t g = (col >> ( 8 - 7)) & (0xff << 7); |
| 214 | int16_t b = (col << ( 0 + 7)) & (0xff << 7); |
| 215 | |
| 216 | int16_t coeff = filter_horiz[x]; |
| 217 | |
| 218 | res_a += (a * coeff) >> 16; |
| 219 | res_r += (r * coeff) >> 16; |
| 220 | res_g += (g * coeff) >> 16; |
| 221 | res_b += (b * coeff) >> 16; |
| 222 | } |
| 223 | |
| 224 | output[w] = ( |
| 225 | (uint64_t)res_a << 48) | |
| 226 | ((uint64_t)res_r << 32) | |
| 227 | ((uint64_t)res_g << 16) | |
| 228 | ((uint64_t)res_b << 0); |
| 229 | #endif |
| 230 | } |
| 231 | } |
| 232 | } |
| 233 | |
/* Nearest-neighbor ("point") scaler for ARGB8888, independent of the
 * filter machinery in ctx (ctx itself is unused).
 *
 * Source coordinates are walked in 16.16 fixed point. The initial
 * offset centers the sample grid on the source image and is clamped
 * so the first sample never lands before pixel 0.
 *
 * out_stride / in_stride are pitches in bytes. */
void scaler_argb8888_point_special(const struct scaler_ctx *ctx,
      void *output_, const void *input_,
      int out_width, int out_height,
      int in_width, int in_height,
      int out_stride, int in_stride)
{
   int row, col;
   const uint32_t *src = (const uint32_t*)input_;
   uint32_t *dst = (uint32_t*)output_;
   int step_x = (1 << 16) * in_width / out_width;
   int step_y = (1 << 16) * in_height / out_height;
   int start_x = (1 << 15) * in_width / out_width - (1 << 15);
   int start_y = (1 << 15) * in_height / out_height - (1 << 15);

   (void)ctx;

   /* Clamp the centering offset when upscaling. */
   if (start_x < 0)
      start_x = 0;
   if (start_y < 0)
      start_y = 0;

   for (row = 0; row < out_height; row++)
   {
      const uint32_t *src_row = src + (start_y >> 16) * (in_stride >> 2);
      int fx = start_x;

      for (col = 0; col < out_width; col++)
      {
         dst[col] = src_row[fx >> 16];
         fx += step_x;
      }

      start_y += step_y;
      dst += out_stride >> 2;
   }
}